drive-health-metrics/controller.go
James Coleman ddafa90a02
Some checks failed
Go package / build (push) Has been cancelled
first commit
2026-06-22 17:16:34 -05:00

349 lines
11 KiB
Go

package main
import (
"regexp"
"strconv"
"strings"
)
// ctrlDrive is the RAID controller's own record of a single physical drive. It
// carries what smartctl cannot report: predictive-failure and media/other
// error counters, the controller's state for the drive, and the physical
// enclosure:slot position. Records are paired with a smartctl megaraid
// passthrough through DeviceID, which equals the megaraid,N index.
type ctrlDrive struct {
	// DeviceID is the controller's device index (MegaCLI "Device Id", storcli
	// DID, perccli2 PID) kept as text; it is the matching key.
	DeviceID string
	// Enclosure and Slot locate the drive physically (enclosure:slot).
	Enclosure, Slot string
	// MediaErr, OtherErr and Predictive are the controller's media-error,
	// other-error and predictive-failure counts.
	MediaErr, OtherErr, Predictive int
	// SmartAlert is true when the controller says the drive flagged a
	// S.M.A.R.T alert.
	SmartAlert bool
	// FwState is the controller-reported state of the drive (MegaCLI
	// "Firmware state", the storcli summary State column, or perccli2 Status).
	FwState string
	// TempC is the drive temperature in degrees Celsius; nil when the tool
	// output carried no parseable reading.
	TempC *int
	// Inquiry is the legacy single-line MegaCLI/storcli inquiry (serial model fw).
	Inquiry string
	// Model, Serial and Firmware are the structured identity (perccli2); used
	// for controller-only drives.
	Model, Serial, Firmware string
	// Rotation is "SSD"/"NVMe" derived from controller media/interface, when known.
	Rotation string
}
// controllerIndex queries every supported RAID controller CLI and returns the
// drives they report as a DeviceID->ctrlDrive map. Tool families are tried in
// order of preference: perccli2, then classic storcli/perccli, then MegaCLI.
// Each family is probed independently, and within a family probing stops at
// the first binary that yields at least one drive. Because mergeCtrl keeps the
// first entry per DeviceID, an earlier family's record is never overwritten by
// a later one.
//
// With no controller CLI present (plain HBA / onboard SATA / NVMe) the result
// is an empty, non-nil map — that's fine, smartctl still covers those drives
// directly.
func controllerIndex() map[string]ctrlDrive {
	found := map[string]ctrlDrive{}

	// probe walks candidates in order. A candidate for which lookPath returns
	// "" is skipped; otherwise query is run against the resolved path, every
	// returned drive is merged, and the walk ends once a query produced any
	// drives.
	probe := func(candidates []string, query func(path string) []ctrlDrive) {
		for _, name := range candidates {
			path := lookPath(name)
			if path == "" {
				continue
			}
			got := query(path)
			for _, d := range got {
				mergeCtrl(found, d)
			}
			if len(got) > 0 {
				return
			}
		}
	}

	// perccli2 (8.x) is JSON-native. Its plain-text "show all" adds a second
	// status column that breaks positional parsing, so ask for JSON ('J') and
	// use the dedicated parser. It goes first since it covers the newest
	// controllers.
	probe(
		[]string{"perccli2", "/opt/MegaRAID/perccli2/perccli2"},
		func(path string) []ctrlDrive {
			return parsePerccli2(run(path, "/call/eall/sall", "show", "all", "J"))
		},
	)

	// Classic storcli and perccli (Dell's rebrand) share one text "show all"
	// layout, so a single parser serves every binary in this family.
	probe(
		[]string{"storcli64", "storcli", "perccli64", "perccli",
			"/opt/MegaRAID/storcli/storcli64", "/opt/MegaRAID/perccli/perccli64"},
		func(path string) []ctrlDrive {
			return parseStorcli(run(path, "/call/eall/sall", "show", "all"))
		},
	)

	// MegaCLI serves older controllers.
	probe(
		[]string{"MegaCli64", "MegaCli", "megacli",
			"/opt/MegaRAID/MegaCli/MegaCli64", "/usr/sbin/megacli"},
		func(path string) []ctrlDrive {
			return parseMegacliPDList(run(path, "-PDList", "-aAll"))
		},
	)

	return found
}
// mergeCtrl stores cd in idx under cd.DeviceID unless that key is already
// taken. First writer wins, so a record from a tool queried earlier is never
// replaced by one from a tool queried later. A record with an empty DeviceID
// is ignored because it could not be matched to a drive afterwards.
func mergeCtrl(idx map[string]ctrlDrive, cd ctrlDrive) {
	id := cd.DeviceID
	if id == "" {
		return
	}
	if _, taken := idx[id]; taken {
		return
	}
	idx[id] = cd
}
// afterColon extracts the value part of a "Key : Value" line as printed by
// MegaCLI/storcli: everything after the first colon, with surrounding
// whitespace removed. A line that contains no colon yields "".
func afterColon(s string) string {
	_, value, found := strings.Cut(s, ":")
	if !found {
		return ""
	}
	return strings.TrimSpace(value)
}
// megacliTempRe captures the Celsius figure from a MegaCLI "Drive Temperature"
// line (digits followed by optional whitespace and "C"). It is compiled once at
// package initialisation instead of once per temperature line.
var megacliTempRe = regexp.MustCompile(`(\d+)\s*C`)

// parseMegacliPDList parses the text printed by `MegaCli -PDList -aAll` into
// one ctrlDrive per physical drive.
//
// A record starts at each "Enclosure Device ID" line and collects the
// "Key : Value" lines that follow until the next such line or end of input.
// Lines seen before the first record boundary are ignored. A record is kept
// only if it ended up with a Device Id or a Slot Number. TempC stays nil when
// the temperature line carries no "<digits>C" reading.
//
// It returns nil when text contains no usable record.
func parseMegacliPDList(text string) []ctrlDrive {
	var (
		drives []ctrlDrive
		cur    ctrlDrive
		have   bool // true once the first record boundary has been seen
	)
	// flush appends the record being built, if it is identifiable.
	flush := func() {
		if have && (cur.DeviceID != "" || cur.Slot != "") {
			drives = append(drives, cur)
		}
	}
	for _, raw := range strings.Split(text, "\n") {
		s := strings.TrimSpace(raw)
		switch {
		case strings.HasPrefix(s, "Enclosure Device ID"):
			// Record boundary: emit the previous drive and start a new one.
			flush()
			cur = ctrlDrive{Enclosure: afterColon(s)}
			have = true
		case strings.HasPrefix(s, "Slot Number"):
			cur.Slot = afterColon(s)
		case strings.HasPrefix(s, "Device Id"):
			cur.DeviceID = afterColon(s)
		case strings.HasPrefix(s, "Media Error Count"):
			cur.MediaErr = atoiSafe(afterColon(s))
		case strings.HasPrefix(s, "Other Error Count"):
			cur.OtherErr = atoiSafe(afterColon(s))
		case strings.HasPrefix(s, "Predictive Failure Count"):
			cur.Predictive = atoiSafe(afterColon(s))
		// MegaCLI phrases this as "Drive has flagged a S.M.A.R.T alert : No";
		// the older "S.M.A.R.T alert flagged by drive" form is kept for safety.
		case strings.HasPrefix(s, "Drive has flagged a S.M.A.R.T alert"),
			strings.HasPrefix(s, "S.M.A.R.T alert flagged by drive"):
			cur.SmartAlert = strings.Contains(s, "Yes")
		case strings.HasPrefix(s, "Firmware state"):
			cur.FwState = afterColon(s)
		case strings.HasPrefix(s, "Drive Temperature"):
			if m := megacliTempRe.FindStringSubmatch(s); m != nil {
				cur.TempC = pInt(atoiSafe(m[1]))
			}
		case strings.HasPrefix(s, "Inquiry Data"):
			cur.Inquiry = afterColon(s)
		}
	}
	flush()
	return drives
}
// Patterns used by parseStorcli, compiled once at package initialisation
// instead of on every call (and, for the temperature pattern, on every
// temperature line).
var (
	// storcliDriveHdrRe matches only the bare summary header
	// ("Drive /c0/e64/s0 :"), capturing enclosure and slot. The path must be
	// followed directly by ":" at end of line, so the "- Detailed Information",
	// "State" and "Device attributes" sub-section headers do not match.
	storcliDriveHdrRe = regexp.MustCompile(`^Drive /c\d+/e(\d+)/s(\d+)\s*:$`)
	// storcliSummaryRowRe matches the summary table data row
	// "EID:Slt DID State ...", capturing EID, slot, DID and state.
	storcliSummaryRowRe = regexp.MustCompile(`^(\d+):(\d+)\s+(\d+)\s+(\S+)`)
	// storcliTempRe captures the Celsius figure of a "Drive Temperature" value.
	storcliTempRe = regexp.MustCompile(`(\d+)\s*C`)
)

// parseStorcli parses the text printed by `storcli /call/eall/sall show all`
// (classic perccli prints the same layout) into one ctrlDrive per physical
// drive.
//
// A drive is introduced by a bare summary header ("Drive /c0/e64/s0 :"), which
// supplies enclosure and slot. The table row that follows ("64:0 22 Onln ...")
// supplies the DID (== the megaraid index smartctl uses) and the controller
// state. The same drive then repeats sub-section headers that must NOT open a
// new record, so the "Key = Value" detail lines beneath them accumulate into
// the record opened by the summary header. Everything before the first summary
// header is ignored. A record is kept only if it has a DeviceID or a Slot.
//
// It returns nil when text contains no drive.
func parseStorcli(text string) []ctrlDrive {
	var (
		drives []ctrlDrive
		cur    ctrlDrive
		have   bool // true once the first summary header has been seen
	)
	// flush appends the record being built, if it is identifiable.
	flush := func() {
		if have && (cur.DeviceID != "" || cur.Slot != "") {
			drives = append(drives, cur)
		}
	}
	for _, raw := range strings.Split(text, "\n") {
		s := strings.TrimSpace(raw)

		// New drive record: only the bare summary header opens one.
		if m := storcliDriveHdrRe.FindStringSubmatch(s); m != nil {
			flush()
			cur = ctrlDrive{Enclosure: m[1], Slot: m[2]}
			have = true
			continue
		}
		if !have {
			continue
		}

		// The first summary table row of the record supplies the DID and the
		// controller state; later look-alike lines are not taken as rows.
		if m := storcliSummaryRowRe.FindStringSubmatch(s); m != nil && cur.DeviceID == "" {
			cur.DeviceID = m[3]
			cur.FwState = m[4]
			continue
		}

		// Remaining lines of interest are "Key = Value", split at the first "=".
		k, v, ok := strings.Cut(s, "=")
		if !ok {
			continue
		}
		k, v = strings.TrimSpace(k), strings.TrimSpace(v)

		switch k {
		case "DID":
			cur.DeviceID = v
		case "Media Error Count":
			cur.MediaErr = atoiSafe(v)
		case "Other Error Count":
			cur.OtherErr = atoiSafe(v)
		case "Predictive Failure Count":
			cur.Predictive = atoiSafe(v)
		case "S.M.A.R.T alert flagged by drive":
			cur.SmartAlert = strings.EqualFold(v, "Yes")
		case "Firmware state", "State":
			// Only a fallback: the summary row's state takes precedence.
			if cur.FwState == "" {
				cur.FwState = v
			}
		case "Drive Temperature":
			if m := storcliTempRe.FindStringSubmatch(v); m != nil {
				cur.TempC = pInt(atoiSafe(m[1]))
			}
		case "Model Number", "Manufacturer Identification":
			// Whichever of the two appears first becomes the inquiry string.
			if cur.Inquiry == "" {
				cur.Inquiry = v
			}
		}
	}
	flush()
	return drives
}
// parsePerccli2 parses the JSON printed by `perccli2 /call/eall/sall show all J`
// into one ctrlDrive per listed drive.
//
// perccli2 (8.x) renames the classic DID to PID and splits the single State
// column into State (RAID role: Conf/UConf/GHS/JBOD) and Status (health:
// Online/Offline/Failed/Missing); Status is what ends up in FwState. Drives are
// read from Controllers[] -> "Response Data" -> "Drives List"[]. Controllers or
// list entries that do not have the expected JSON shape are skipped.
//
// It returns nil when loadJSON yields nothing or there is no "Controllers"
// array.
func parsePerccli2(text string) []ctrlDrive {
	root := loadJSON(text)
	if root == nil {
		return nil
	}
	ctrls, isList := root["Controllers"].([]interface{})
	if !isList {
		return nil
	}

	var out []ctrlDrive
	for _, rawCtrl := range ctrls {
		ctrl, isObj := rawCtrl.(map[string]interface{})
		if !isObj {
			continue
		}
		entries, isList := jLeaf(ctrl, "Response Data", "Drives List").([]interface{})
		if !isList {
			continue
		}
		for _, rawDrive := range entries {
			entry, isObj := rawDrive.(map[string]interface{})
			if !isObj {
				continue
			}
			if cd, keep := perccli2Drive(entry); keep {
				out = append(out, cd)
			}
		}
	}
	return out
}

// perccli2Drive converts one "Drives List" element into a ctrlDrive. The bool
// is false when the element has no "Drive Information" block, or when neither
// a DeviceID nor a Slot could be read from it.
func perccli2Drive(entry map[string]interface{}) (ctrlDrive, bool) {
	summary := jObj(entry, "Drive Information")
	if summary == nil {
		return ctrlDrive{}, false
	}

	var cd ctrlDrive

	// Location and identity come from the summary block. "EID:Slt" is a single
	// "enclosure:slot" string; it is used only when it contains the colon.
	if enc, slot, ok := strings.Cut(jStr(summary, "EID:Slt"), ":"); ok {
		cd.Enclosure = strings.TrimSpace(enc)
		cd.Slot = strings.TrimSpace(slot)
	}
	if pid := jInt(summary, "PID"); pid != nil {
		cd.DeviceID = strconv.Itoa(*pid)
	}
	cd.FwState = jStr(summary, "Status") // The health verdict, not the RAID role.
	cd.Model = jStr(summary, "Model")
	cd.Rotation = perccli2Rotation(jStr(summary, "Intf"), jStr(summary, "Med"))

	// The detail block adds serial/firmware, a model fallback, temperature and
	// the error counters.
	if detail := jObj(entry, "Drive Detailed Information"); detail != nil {
		cd.Serial = jStr(detail, "Serial Number")
		cd.Firmware = jStr(detail, "Firmware Revision Level")
		if cd.Model == "" {
			cd.Model = jStr(detail, "Model")
		}
		if temp := jInt(detail, "Temperature(C)"); temp != nil {
			cd.TempC = temp
		}

		// A counter sits directly in the detail block (SAS/SATA) or under its
		// "LU/NS Properties" (NVMe namespaces). The detail block is consulted
		// first, and a counter found in neither place reads as 0.
		props := jObj(detail, "LU/NS Properties")
		counter := func(key string) int {
			if v := jInt(detail, key); v != nil {
				return *v
			}
			if props == nil {
				return 0
			}
			if v := jInt(props, key); v != nil {
				return *v
			}
			return 0
		}
		cd.MediaErr = counter("Media Error Count")
		cd.OtherErr = counter("Other Error Count")
		cd.Predictive = counter("Predictive Failure Count")
	}

	return cd, cd.DeviceID != "" || cd.Slot != ""
}
// perccli2Rotation turns a perccli2 interface/media pair into a rotation
// label. An NVMe interface yields "NVMe" whatever the media says; otherwise
// SSD media yields "SSD"; anything else yields "". Both comparisons ignore
// case.
func perccli2Rotation(intf, med string) string {
	if strings.EqualFold(intf, "NVMe") {
		return "NVMe"
	}
	if strings.EqualFold(med, "SSD") {
		return "SSD"
	}
	return ""
}
// firstIntRe matches the first run of decimal digits in a string, optionally
// preceded by a minus sign. It is compiled once at package initialisation
// instead of on every atoiSafe call.
var firstIntRe = regexp.MustCompile(`-?\d+`)

// atoiSafe returns the first integer embedded in s (a leading minus sign is
// honoured), since controller output often wraps the number in units or
// surrounding labels. It returns 0 when s contains no digits or when the digit
// run does not fit in an int.
func atoiSafe(s string) int {
	digits := firstIntRe.FindString(strings.TrimSpace(s))
	if digits == "" {
		return 0
	}
	n, err := strconv.Atoi(digits)
	if err != nil {
		// Out of range for int: treated the same as "no number".
		return 0
	}
	return n
}