drive-health-metrics/smart_json.go
James Coleman ddafa90a02
Some checks failed
Go package / build (push) Has been cancelled
first commit
2026-06-22 17:16:34 -05:00

284 lines
7.9 KiB
Go

package main
import (
"fmt"
"strings"
)
// wearAttr lists the vendor SSD-life attribute IDs together with the label
// recorded as the wear source. For every entry the normalized VALUE column is
// read as "% life remaining".
//
// The order is a priority order: applyAtaCounters walks the table from the top
// and stops at the first attribute that carries a normalized value.
var wearAttr = []struct {
	id  int
	src string
}{
	{id: 173, src: "Micron/ID173"},
	{id: 202, src: "Intel/ID202"},
	{id: 231, src: "Intel/ID231"},
	{id: 177, src: "Samsung/ID177"},
	{id: 233, src: "Generic/ID233"},
}
// ataAttr holds one parsed row of an ATA SMART attribute table. The JSON and
// text parsing paths both produce it. Any of the pointer fields may be nil when
// the corresponding number was not available for the row.
type ataAttr struct {
	// value and worst are the vendor-normalized current and worst values.
	value, worst *int
	// whenFailed is the row's failure flag; attrsFailed treats "now" and
	// "past" (in any letter case) as a failure.
	whenFailed string
	// raw is the attribute's raw counter.
	raw *int
}
// attrRaw looks up attribute id and returns its raw counter. The result is nil
// when the table has no such attribute, and also when the row exists but
// carries no raw value.
func attrRaw(attrs map[int]ataAttr, id int) *int {
	// A missing key yields the zero ataAttr, whose raw pointer is already nil.
	return attrs[id].raw
}
// attrVal looks up attribute id and returns its normalized current value. The
// result is nil when the attribute is absent or the row has no value.
func attrVal(attrs map[int]ataAttr, id int) *int {
	// A missing key yields the zero ataAttr, whose value pointer is already nil.
	return attrs[id].value
}
// attrsFailed reports whether at least one attribute row is flagged as failing
// now or as having failed in the past. parseSmartJSON uses it to choose between
// FAILED and PASSED_BY_ATTR when no explicit overall health result is present.
//
// The flag is compared case-insensitively against "now" and "past"; any other
// value, including the empty string, counts as not failed.
func attrsFailed(attrs map[int]ataAttr) bool {
	for _, row := range attrs {
		switch strings.ToLower(row.whenFailed) {
		case "now", "past":
			return true
		}
	}
	return false
}
// applyAtaCounters copies the parsed ATA attribute table onto the defect, wear,
// reserve-block, and host-write fields of d. Both the JSON and the text paths
// call it; fallbacks specific to one path (power-cycle/temperature on text,
// SCSI/NVMe on JSON) remain with the caller.
//
// Field handling differs by group:
//   - Defect counters are always overwritten, with nil when the attribute is
//     absent.
//   - PowerOnHours is taken from attribute 9 only if it is still nil.
//   - Wear fields come from the first wearAttr entry that has a normalized
//     value; when none does they are left untouched.
//   - UnusedReservePct (ID 180) and HostWrittenTB (ID 246) are written only
//     when their attribute is present (and, for 246, has a positive raw value).
func applyAtaCounters(attrs map[int]ataAttr, d *Drive) {
	// Defect counters, straight from the raw column, in attribute-ID order.
	d.Reallocated = attrRaw(attrs, 5)
	d.RuntimeBadBlocks = attrRaw(attrs, 183)
	d.EndToEnd = attrRaw(attrs, 184)
	d.ReportedUncorrect = attrRaw(attrs, 187)
	d.ReallocatedEvents = attrRaw(attrs, 196)
	d.Pending = attrRaw(attrs, 197)
	d.Uncorrectable = attrRaw(attrs, 198)
	d.UdmaCrc = attrRaw(attrs, 199)

	// A dedicated power-on-hours field, if the caller found one, wins over
	// attribute 9.
	if d.PowerOnHours == nil {
		d.PowerOnHours = attrRaw(attrs, 9)
	}

	// Wear: vendor-normalized VALUE is "% remaining". The first table entry
	// with a value decides; later entries are not consulted.
	for _, candidate := range wearAttr {
		row, present := attrs[candidate.id]
		if !present || row.value == nil {
			continue
		}
		d.WearPctRemaining = row.value
		d.WearPctWorst = row.worst
		d.WearSrc = candidate.src
		d.WearPctConsumed = pInt(100 - *row.value)
		break
	}

	// Micron ID180 reserve blocks: VALUE is "% remaining".
	if reserve, present := attrs[180]; present {
		d.UnusedReservePct = reserve.value
	}

	// ID246 host writes: the raw count is scaled by 512 bytes and reported in
	// units of 1e12 bytes.
	if sectors := attrRaw(attrs, 246); sectors != nil && *sectors > 0 {
		d.HostWrittenTB = pF(float64(*sectors) * 512.0 / 1e12)
	}
}
// parseSmartJSON fills d from a decoded smartctl -j object (ATA/SATA, SAS/SCSI,
// or NVMe). A nil j leaves d untouched.
//
// Stages run in a fixed order. Unless noted, a later stage only fills fields an
// earlier one left nil:
//   - identity, capacity, rotation, and the dedicated power/temperature fields;
//   - the ATA attribute table, the health verdict, and applyAtaCounters (which
//     overwrites the defect counters unconditionally);
//   - SCSI/SAS endurance and defect fallbacks;
//   - the NVMe SMART/Health log, whose percentage_used overrides any wear
//     figures set earlier;
//   - smartctl's own diagnostic messages.
//
// SCSI and NVMe "percentage used" readings above 100 are handled explicitly:
// WearPctConsumed keeps the reading as reported, while the derived
// WearPctRemaining is floored at 0 instead of going negative.
func parseSmartJSON(j map[string]interface{}, d *Drive) {
	if j == nil {
		return
	}

	// remainingLife turns a "% used" reading into "% remaining", flooring at 0
	// for readings above 100.
	remainingLife := func(used int) int {
		if used > 100 {
			return 0
		}
		return 100 - used
	}

	// ---- Identity, capacity, rotation ----
	d.Model = first(jStr(j, "model_name"), jStr(j, "scsi_model_name"))
	d.Serial = jStr(j, "serial_number")
	d.Transport = jStr(j, "scsi_transport_protocol", "name")
	d.Firmware = first(jStr(j, "firmware_version"), jStr(j, "scsi_revision"), jStr(j, "revision"))
	if capBytes := jInt(j, "user_capacity", "bytes"); capBytes != nil && *capBytes > 0 {
		d.Capacity = fmt.Sprintf("%.2f TB", float64(*capBytes)/1e12)
	}
	switch rr := jInt(j, "rotation_rate"); {
	case rr != nil && *rr == 0:
		d.Rotation = "SSD"
	case rr != nil:
		d.Rotation = fmt.Sprintf("%d rpm", *rr)
	default:
		d.Rotation = "SSD" // Absent rotation_rate: assume SSD; NVMe is corrected just below.
	}
	if strings.Contains(strings.ToLower(jStr(j, "device", "type")), "nvme") ||
		jObj(j, "nvme_smart_health_information_log") != nil {
		d.Rotation = "NVMe"
	}

	d.PowerOnHours = jInt(j, "power_on_time", "hours")
	d.PowerCycleCount = jInt(j, "power_cycle_count")
	d.TempC = jInt(j, "temperature", "current")

	// ---- ATA attribute table ----
	attrs := map[int]ataAttr{}
	if table, ok := jLeaf(j, "ata_smart_attributes", "table").([]interface{}); ok {
		for _, it := range table {
			a, ok := it.(map[string]interface{})
			if !ok {
				continue
			}
			id := jInt(a, "id")
			if id == nil {
				continue
			}
			at := ataAttr{
				value:      jInt(a, "value"),
				worst:      jInt(a, "worst"),
				whenFailed: jStr(a, "when_failed"),
			}
			// Prefer the leading integer of raw.string (raw.value overflows
			// for some attributes); fall back to raw.value.
			if rs := jStr(a, "raw", "string"); rs != "" {
				if n, ok := firstInt(rs); ok {
					at.raw = &n
				}
			}
			if at.raw == nil {
				at.raw = jInt(a, "raw", "value")
			}
			attrs[*id] = at
		}
	}

	// ---- SMART health verdict ----
	// An explicit smart_status.passed wins. Without it, a non-empty attribute
	// table decides; with neither, the verdict is UNKNOWN.
	passed := jBoolPtr(j, "smart_status", "passed")
	switch {
	case passed != nil && *passed:
		d.SmartHealth = "PASSED"
	case passed != nil:
		d.SmartHealth = "FAILED"
	case len(attrs) == 0:
		d.SmartHealth = "UNKNOWN"
	case attrsFailed(attrs):
		d.SmartHealth = "FAILED"
	default:
		d.SmartHealth = "PASSED_BY_ATTR"
	}

	// Defect, wear, reserve, and host-write fields shared with the text path.
	applyAtaCounters(attrs, d)

	// ---- SCSI/SAS endurance + grown defect list ----
	if d.WearPctRemaining == nil {
		if pu := jInt(j, "scsi_percentage_used_endurance_indicator"); pu != nil {
			d.WearPctConsumed = pu
			d.WearPctRemaining = pInt(remainingLife(*pu))
			d.WearSrc = "SCSI/endurance"
		}
	}
	if grown := jInt(j, "scsi_grown_defect_list"); grown != nil && d.Reallocated == nil {
		d.Reallocated = grown
	}

	// SAS drives have no ATA attribute table; their hard-defect signals live in
	// the SCSI logs. Map them onto the fields the scorer already grades: total
	// uncorrected read/write/verify errors -> uncorrectable sectors, and the
	// pending (to-be-reassigned) defect count -> current pending sectors.
	if d.Uncorrectable == nil {
		if ec := jObj(j, "scsi_error_counter_log"); ec != nil {
			sum, found := 0, false
			for _, op := range []string{"read", "write", "verify"} {
				if u := jInt(ec, op, "total_uncorrected_errors"); u != nil {
					found = true
					sum += *u
				}
			}
			// Only report a total when at least one counter was present, so
			// "no data" stays distinguishable from "zero errors".
			if found {
				d.Uncorrectable = pInt(sum)
			}
		}
	}
	if d.Pending == nil {
		if pd := jInt(j, "scsi_pending_defects", "count"); pd != nil {
			d.Pending = pd
		}
	}

	// ---- NVMe SMART/Health log ----
	if nv := jObj(j, "nvme_smart_health_information_log"); nv != nil {
		d.NvmeCriticalWarning = jInt(nv, "critical_warning")
		d.NvmeAvailSpare = jInt(nv, "available_spare")
		d.NvmeAvailSpareThresh = jInt(nv, "available_spare_threshold")
		d.NvmeMediaErrors = jInt(nv, "media_errors")
		if d.PowerOnHours == nil {
			d.PowerOnHours = jInt(nv, "power_on_hours")
		}
		if d.PowerCycleCount == nil {
			d.PowerCycleCount = jInt(nv, "power_cycles")
		}
		if d.TempC == nil {
			d.TempC = jInt(nv, "temperature")
		}
		// percentage_used replaces any wear figures set above.
		if pu := jInt(nv, "percentage_used"); pu != nil {
			d.WearPctConsumed = pu
			d.WearPctRemaining = pInt(remainingLife(*pu))
			d.WearSrc = "NVMe/percentage_used"
		}
		// Treat NVMe media+data integrity errors like uncorrectable sectors.
		if d.Uncorrectable == nil && d.NvmeMediaErrors != nil {
			d.Uncorrectable = d.NvmeMediaErrors
		}
	}

	// Capture smartctl passthrough diagnostics.
	if msgs, ok := jLeaf(j, "smartctl", "messages").([]interface{}); ok {
		var parts []string
		for _, mm := range msgs {
			if mo, ok := mm.(map[string]interface{}); ok {
				if s := jStr(mo, "string"); s != "" {
					parts = append(parts, s)
				}
			}
		}
		d.SmartctlMessages = strings.Join(parts, "; ")
	}
}
// jsonUsable reports whether a decoded smartctl object is worth parsing: it
// must name the device (model_name or scsi_model_name) and carry at least one
// health source — an ATA attribute object, a smart_status object, an NVMe
// SMART/Health log, or the SCSI percentage-used endurance indicator.
func jsonUsable(j map[string]interface{}) bool {
	if j == nil {
		return false
	}
	// Identity first: without a model name nothing else matters.
	if jStr(j, "model_name") == "" && jStr(j, "scsi_model_name") == "" {
		return false
	}
	// Any one of the object-valued health sections is enough.
	for _, section := range []string{
		"ata_smart_attributes",
		"smart_status",
		"nvme_smart_health_information_log",
	} {
		if jObj(j, section) != nil {
			return true
		}
	}
	// Last resort: the SCSI endurance indicator, which is a plain number.
	return jInt(j, "scsi_percentage_used_endurance_indicator") != nil
}
// first scans vals left to right and returns the first one that is not the
// empty string. It returns "" when every value is empty or none were given.
func first(vals ...string) string {
	for i := range vals {
		if candidate := vals[i]; len(candidate) > 0 {
			return candidate
		}
	}
	return ""
}