package main import ( "fmt" "strings" ) // scoreDrive computes a drive's risk score, recommendation, and the reason // string behind them. The scoring rules are deliberate: // // - Only real, drive-attributable defects add meaningful score. // - Missing or unreadable data is never treated as a failure (no points). // - Wear and age are graded to nudge toward planned replacement. // // The score maps to a recommendation: // // >= 100 -> REPLACE_NOW (hard defect: drive is failing or failed) // >= 50 -> REPLACE_SOON (serious wear or accumulating defects) // >= 20 -> MONITOR (early warning signs) // < 20 -> OK // // A drive with no SMART data and no controller red flags scores NO_DATA, meaning // re-collect rather than replace. func scoreDrive(d *Drive) (int, string, string) { score := 0 var reasons []string add := func(pts int, msg string) { score += pts reasons = append(reasons, msg) } min := func(a, b int) int { if a < b { return a } return b } // ---- Hard physical defects (drive-attributable) ---- if realloc := iv(d.Reallocated); realloc > 0 { pts := min(40+realloc*5, 100) add(pts, fmt.Sprintf("reallocated=%d(+%d)", realloc, pts)) } if pending := iv(d.Pending); pending > 0 { pts := min(50+pending*5, 100) add(pts, fmt.Sprintf("pending=%d(+%d)", pending, pts)) } if uncorr := iv(d.Uncorrectable); uncorr > 0 { pts := min(60+uncorr*5, 100) add(pts, fmt.Sprintf("uncorrectable=%d(+%d)", uncorr, pts)) } if reported := iv(d.ReportedUncorrect); reported > 0 { pts := min(reported*10, 60) add(pts, fmt.Sprintf("reported_uncorrect=%d(+%d)", reported, pts)) } if e2e := iv(d.EndToEnd); e2e > 0 { pts := min(e2e*20, 80) add(pts, fmt.Sprintf("end_to_end_err=%d(+%d)", e2e, pts)) } if badblk := iv(d.RuntimeBadBlocks); badblk > 0 { pts := min(badblk*5, 40) add(pts, fmt.Sprintf("runtime_bad_blocks=%d(+%d)", badblk, pts)) } // ---- SMART self-assessment: only penalize an EXPLICIT failure ---- if d.SmartHealth == "FAILED" { add(100, "SMART_health=FAILED(+100)") } // ---- SATA link quality (cabling/backplane, not the NAND) ---- if crc := iv(d.UdmaCrc); crc > 0 { pts := min(crc*3, 25) add(pts, fmt.Sprintf("udma_crc=%d(+%d)", crc, pts)) } // ---- RAID-controller signals ---- if d.PredictiveFailureCtrl > 0 { add(70, "ctrl_predictive_failure(+70)") } if d.SmartAlertCtrl { add(50, "ctrl_smart_alert(+50)") } // Penalize a controller-reported state only when it is not a healthy one. // MegaCLI spells these out ("Online, Spun Up", "Hotspare") while storcli // abbreviates ("Onln", "GHS"/"DHS" for hot spares, "JBOD"); all are fine. fw := strings.ToLower(d.FwState) fwHealthy := fw == "" || strings.Contains(fw, "online") || strings.Contains(fw, "onln") || strings.Contains(fw, "hotspare") || strings.Contains(fw, "ghs") || strings.Contains(fw, "dhs") || strings.Contains(fw, "jbod") if !fwHealthy { add(40, fmt.Sprintf("fw_state=%s(+40)", d.FwState)) } // MegaCLI/storcli media errors: soft signal, graded gently and capped. me := d.MediaErrCtrl switch { case me >= 100: add(30, fmt.Sprintf("ctrl_media_errors=%d(+30)", me)) case me >= 20: add(15, fmt.Sprintf("ctrl_media_errors=%d(+15)", me)) case me > 0: add(5, fmt.Sprintf("ctrl_media_errors=%d(+5)", me)) } // ---- NVMe critical warning bitmask (any bit set is a real alert) ---- if d.NvmeCriticalWarning != nil && *d.NvmeCriticalWarning > 0 { add(60, fmt.Sprintf("nvme_critical_warning=0x%02x(+60)", *d.NvmeCriticalWarning)) } // NVMe spare below threshold -> reserve exhaustion. if d.NvmeAvailSpare != nil && d.NvmeAvailSpareThresh != nil && *d.NvmeAvailSpare <= *d.NvmeAvailSpareThresh { add(40, fmt.Sprintf("nvme_avail_spare<=thresh(%d<=%d)(+40)", *d.NvmeAvailSpare, *d.NvmeAvailSpareThresh)) } // ---- Wear (graded; only meaningful with real SMART data) ---- if d.WearPctConsumed != nil { wc := *d.WearPctConsumed switch { case wc >= 95: add(80, fmt.Sprintf("wear_consumed=%d%%(+80)", wc)) case wc >= 90: add(55, fmt.Sprintf("wear_consumed=%d%%(+55)", wc)) case wc >= 80: add(30, fmt.Sprintf("wear_consumed=%d%%(+30)", wc)) case wc >= 70: add(15, fmt.Sprintf("wear_consumed=%d%%(+15)", wc)) case wc >= 60: add(8, fmt.Sprintf("wear_consumed=%d%%(+8)", wc)) } } // ---- Reserve-block exhaustion (Micron ID180 VALUE -> remaining %) ---- if d.UnusedReservePct != nil && *d.UnusedReservePct <= 10 { add(30, fmt.Sprintf("reserve_blocks_low(val=%d)(+30)", *d.UnusedReservePct)) } // ---- Age (gentle nudge only) ---- hours := iv(d.PowerOnHours) switch { case hours >= 61320: // Older than 7 years. add(15, fmt.Sprintf("age=%dh(+15)", hours)) case hours >= 52560: // Older than 6 years. add(8, fmt.Sprintf("age=%dh(+8)", hours)) case hours >= 43800: // Older than 5 years. add(4, fmt.Sprintf("age=%dh(+4)", hours)) } // ---- Decide recommendation ---- // NO_DATA only when nothing observed the drive: no SMART, no controller error // signals, and no controller state. A controller-only drive (e.g. NVMe behind // a PERC) reports a FwState, so it is scored on controller evidence instead. if !d.HaveSmart && me == 0 && d.PredictiveFailureCtrl == 0 && !d.SmartAlertCtrl && d.FwState == "" { return 0, "NO_DATA", "smartctl returned no usable SMART data; re-collect" } var rec string switch { case score >= 100: rec = "REPLACE_NOW" case score >= 50: rec = "REPLACE_SOON" case score >= 20: rec = "MONITOR" default: rec = "OK" } if len(reasons) == 0 { return score, rec, "no defects detected" } return score, rec, strings.Join(reasons, "; ") }