189 lines
5.2 KiB
Go
189 lines
5.2 KiB
Go
package main
|
|
|
|
import (
|
|
"fmt"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// collect discovers every drive, queries SMART, attaches controller data, and
|
|
// scores it.
|
|
func collect() ([]*Drive, int64) {
|
|
host := hostname()
|
|
if app != nil && app.config != nil && app.config.Hostname != "" {
|
|
host = app.config.Hostname
|
|
}
|
|
collectedAt := time.Now().UTC().Format("2006-01-02T15:04:05Z")
|
|
tsNs := time.Now().Unix() * 1e9
|
|
|
|
st := newSmartTool()
|
|
ctrl := controllerIndex()
|
|
devices := st.scan()
|
|
|
|
// Fallback: no scan-open results but we do have controller drives -> probe
|
|
// a base device by megaraid index.
|
|
if len(devices) == 0 && len(ctrl) > 0 {
|
|
base := findBaseDev()
|
|
ids := make([]string, 0, len(ctrl))
|
|
for id := range ctrl {
|
|
ids = append(ids, id)
|
|
}
|
|
sort.Strings(ids)
|
|
for _, id := range ids {
|
|
for _, tmpl := range megaraidDtypes {
|
|
devices = append(devices, scanned{
|
|
path: base, dtype: fmt.Sprintf(tmpl, id), megaraidN: id,
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
var drives []*Drive
|
|
matched := map[string]bool{} // Controller IDs covered by a smartctl device.
|
|
for _, sc := range devices {
|
|
if sc.megaraidN != "" {
|
|
matched[sc.megaraidN] = true
|
|
}
|
|
d := &Drive{
|
|
CollectedAt: collectedAt,
|
|
Hostname: host,
|
|
DeviceID: sc.megaraidN,
|
|
}
|
|
|
|
ok := st.querySmart(sc.path, sc.dtype, d)
|
|
|
|
// Skip iSCSI LUNs and RAID virtual disks; they are not physical drives.
|
|
if isPseudoDevice(d) {
|
|
continue
|
|
}
|
|
|
|
// Attach controller-side data by megaraid index == DeviceID.
|
|
if sc.megaraidN != "" {
|
|
if cd, found := ctrl[sc.megaraidN]; found {
|
|
applyController(d, cd)
|
|
}
|
|
}
|
|
|
|
// Determine whether real SMART attribute data was obtained.
|
|
d.HaveSmart = ok && d.Model != "" && (d.PowerOnHours != nil ||
|
|
d.WearPctRemaining != nil ||
|
|
d.SmartHealth == "PASSED" || d.SmartHealth == "FAILED" ||
|
|
d.SmartHealth == "PASSED_BY_ATTR")
|
|
|
|
finalizeDerived(d)
|
|
drives = append(drives, d)
|
|
}
|
|
|
|
// Emit controller-only drives: physical drives the controller reports but
|
|
// smartctl cannot reach (e.g. NVMe behind a PERC). Health comes entirely
|
|
// from the controller (Status, predictive-failure, media/other counters).
|
|
ids := make([]string, 0, len(ctrl))
|
|
for id := range ctrl {
|
|
ids = append(ids, id)
|
|
}
|
|
sort.Strings(ids)
|
|
for _, id := range ids {
|
|
if matched[id] {
|
|
continue
|
|
}
|
|
d := &Drive{CollectedAt: collectedAt, Hostname: host, DeviceID: id}
|
|
applyController(d, ctrl[id])
|
|
if isPseudoDevice(d) {
|
|
continue
|
|
}
|
|
d.HaveSmart = false
|
|
finalizeDerived(d)
|
|
drives = append(drives, d)
|
|
}
|
|
return drives, tsNs
|
|
}
|
|
|
|
// applyController fills controller-side fields and uses MegaCLI/storcli inquiry
|
|
// as an identity fallback when smartctl passthrough failed.
|
|
func applyController(d *Drive, cd ctrlDrive) {
|
|
d.Enclosure = cd.Enclosure
|
|
d.Slot = cd.Slot
|
|
d.MediaErrCtrl = cd.MediaErr
|
|
d.OtherErrCtrl = cd.OtherErr
|
|
d.PredictiveFailureCtrl = cd.Predictive
|
|
d.SmartAlertCtrl = cd.SmartAlert
|
|
d.FwState = cd.FwState
|
|
|
|
// Identity fallback for when smartctl could not read the drive. Prefer the
|
|
// structured fields (perccli2); else split the legacy single-line Inquiry.
|
|
if cd.Model != "" || cd.Serial != "" || cd.Firmware != "" {
|
|
if d.Serial == "" {
|
|
d.Serial = cd.Serial
|
|
}
|
|
if d.Model == "" {
|
|
d.Model = cd.Model
|
|
}
|
|
if d.Firmware == "" {
|
|
d.Firmware = cd.Firmware
|
|
}
|
|
} else if cd.Inquiry != "" {
|
|
// Legacy MegaCLI "Inquiry Data" packs "<serial> <model...> <firmware>" on
|
|
// one line, where the model itself can contain spaces and the token count
|
|
// varies. Serial is always first and the firmware revision always last, so
|
|
// anchor on those and treat everything between as the model.
|
|
parts := strings.Fields(cd.Inquiry)
|
|
if d.Serial == "" && len(parts) >= 1 {
|
|
d.Serial = parts[0]
|
|
}
|
|
if d.Firmware == "" && len(parts) >= 2 {
|
|
d.Firmware = parts[len(parts)-1]
|
|
}
|
|
if d.Model == "" && len(parts) >= 3 {
|
|
d.Model = strings.Join(parts[1:len(parts)-1], " ")
|
|
}
|
|
}
|
|
if d.Rotation == "" {
|
|
d.Rotation = cd.Rotation
|
|
}
|
|
if d.TempC == nil {
|
|
d.TempC = cd.TempC
|
|
}
|
|
}
|
|
|
|
// finalizeDerived computes defect_total, power_on_years, and the risk score.
|
|
func finalizeDerived(d *Drive) {
|
|
// Aggregate drive-attributable defect counters. nil only when NONE was
|
|
// readable, so NO_DATA rows stay blank instead of showing a misleading 0.
|
|
defectParts := []*int{
|
|
d.Reallocated, d.Pending, d.Uncorrectable,
|
|
d.ReportedUncorrect, d.RuntimeBadBlocks, d.EndToEnd,
|
|
}
|
|
anyKnown := false
|
|
sum := 0
|
|
for _, p := range defectParts {
|
|
if p != nil {
|
|
anyKnown = true
|
|
sum += *p
|
|
}
|
|
}
|
|
if anyKnown {
|
|
d.DefectTotal = pInt(sum)
|
|
}
|
|
|
|
if d.PowerOnHours != nil && *d.PowerOnHours > 0 {
|
|
y := float64(*d.PowerOnHours) / 8760.0
|
|
d.PowerOnYears = pF(float64(int(y*100+0.5)) / 100) // Round to two decimals.
|
|
}
|
|
|
|
d.RiskScore, d.Recommendation, d.RiskReasons = scoreDrive(d)
|
|
}
|
|
|
|
// findBaseDev returns a real base block device to anchor the megaraid
|
|
// passthrough fallback probe, skipping loop and md devices and defaulting to
|
|
// /dev/sda when lsblk yields nothing usable.
|
|
func findBaseDev() string {
|
|
out := run("lsblk", "-dno", "NAME")
|
|
for _, ln := range strings.Split(out, "\n") {
|
|
name := strings.TrimSpace(ln)
|
|
if name != "" && !strings.Contains(name, "loop") && !strings.HasPrefix(name, "md") {
|
|
return "/dev/" + name
|
|
}
|
|
}
|
|
return "/dev/sda"
|
|
}
|