smartmontools - Festplatten-Gesundheit überwachen | Blog

smartmontools ist das Standard-Werkzeug zur Überwachung von Festplatten und SSDs. Es liest S.M.A.R.T.-Daten aus und warnt vor drohenden Ausfällen.

Installation

# Debian/Ubuntu
apt install smartmontools

# RHEL/CentOS
dnf install smartmontools

# Service aktivieren (unter Debian/Ubuntu heißt die Unit je nach Version ggf. smartmontools statt smartd)
systemctl enable smartd
systemctl start smartd

Grundlagen

Disk-Informationen

# S.M.A.R.T. Unterstützung prüfen
smartctl -i /dev/sda

# Ausgabe:
# Model Family:     Seagate Barracuda
# Device Model:     ST2000DM008-2FR102
# Serial Number:    ZFL1234X
# Firmware Version: 0001
# SMART support is: Available
# SMART support is: Enabled

S.M.A.R.T. aktivieren

# S.M.A.R.T. einschalten
smartctl -s on /dev/sda

# Status prüfen
smartctl -c /dev/sda

Gesundheits-Check

Schneller Check

# Health-Status
smartctl -H /dev/sda

# Ausgabe bei OK:
# SMART overall-health self-assessment test result: PASSED

# Bei Problemen:
# SMART overall-health self-assessment test result: FAILED!

Alle Attribute

# Alle S.M.A.R.T. Attribute
smartctl -A /dev/sda

# Ausgabe:
# ID# ATTRIBUTE_NAME          FLAG     VALUE WORST THRESH TYPE      UPDATED  WHEN_FAILED RAW_VALUE
#   1 Raw_Read_Error_Rate     0x000f   118   099   006    Pre-fail  Always       -       193456784
#   5 Reallocated_Sector_Ct   0x0033   100   100   010    Pre-fail  Always       -       0
#   9 Power_On_Hours          0x0032   089   089   000    Old_age   Always       -       9876
# 194 Temperature_Celsius     0x0022   031   040   000    Old_age   Always       -       31
# 197 Current_Pending_Sector  0x0012   100   100   000    Old_age   Always       -       0
# 198 Offline_Uncorrectable   0x0010   100   100   000    Old_age   Offline      -       0

Vollständiger Report

# Alle Informationen
smartctl -a /dev/sda

# Erweiterte Ausgabe (-x = --xall: zusätzlich erweiterte Logs und Nicht-S.M.A.R.T.-Informationen)
smartctl -x /dev/sda

Wichtige Attribute

ID	Attribut	Bedeutung
5	Reallocated_Sector_Ct	Umgemappte Sektoren (kritisch!)
9	Power_On_Hours	Betriebsstunden
187	Reported_Uncorrectable	Nicht korrigierbare Fehler
188	Command_Timeout	Timeout-Fehler
194	Temperature_Celsius	Temperatur
197	Current_Pending_Sector	Ausstehende Sektoren
198	Offline_Uncorrectable	Offline nicht korrigierbar

Selbsttests

Test-Typen

# Kurzer Test (1-2 Minuten)
smartctl -t short /dev/sda

# Langer Test (Stunden)
smartctl -t long /dev/sda

# Conveyance Test (Transport)
smartctl -t conveyance /dev/sda

# Selektiver Test
smartctl -t select,0-1000 /dev/sda

Test-Status

# Laufende Tests prüfen
smartctl -c /dev/sda

# Test-Ergebnisse
smartctl -l selftest /dev/sda

# Ausgabe:
# Num  Test_Description    Status                  Remaining  LifeTime
# # 1  Short offline       Completed without error       00%     12345
# # 2  Extended offline    Completed without error       00%     12300

Test abbrechen

smartctl -X /dev/sda

Error-Logs

# Error-Log anzeigen
smartctl -l error /dev/sda

# Bei Fehlern:
# Error 1 occurred at disk power-on lifetime: 12345 hours
# When the command that caused the error occurred, the device was active or idle.
# After command completion occurred, registers were:
# ER ST SC SN CL CH DH
# 40 51 08 00 00 00 e0  Error: UNC 8 sectors at LBA = 0x00000000

NVMe-SSDs

# NVMe-Info
smartctl -a /dev/nvme0

# Oder mit nvme-cli (Paket nvme-cli muss installiert sein)
nvme smart-log /dev/nvme0

# NVMe-spezifische Attribute:
# Critical Warning:                   0x00
# Temperature:                        34 Celsius
# Available Spare:                    100%
# Available Spare Threshold:          10%
# Percentage Used:                    1%
# Data Units Read:                    12345678
# Data Units Written:                 98765432
# Power Cycles:                       100
# Power On Hours:                     5000

smartd - Daemon

Konfiguration

# /etc/smartd.conf

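# Alle Disks überwachen
# Achtung: smartd ignoriert alle Zeilen NACH einem DEVICESCAN-Eintrag.
# Entweder DEVICESCAN verwenden oder Einzeleinträge – bzw. Einzeleinträge vor DEVICESCAN setzen.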
DEVICESCAN -a -o on -S on -n standby,q -s (S/../.././02|L/../../6/03) -m admin@example.de

# Einzelne Disk
/dev/sda -a -o on -S on -s (S/../.././02|L/../../6/03) -m admin@example.de -M exec /usr/local/bin/smart-notify.sh

# NVMe (-o und -S sind ATA-spezifische Direktiven und können bei NVMe entfallen)
/dev/nvme0 -a -o on -S on -m admin@example.de

Optionen

Option	Beschreibung
-a	Standard-Überwachung (Health-Status, Attribute, Error- und Selftest-Log)
-o on	Offline Tests aktivieren
-S on	Attribut-Autosave
-n standby	Disk nicht aufwecken
-s	Test-Zeitplan
-m	Mail bei Fehlern
-M exec	Script statt Mail-Programm ausführen (nur zusammen mit -m wirksam)

Test-Zeitplan

# Format: (T/MM/DD/d/HH)
# T = Test-Typ (S=short, L=long, C=conveyance, O=offline)
# MM = Monat (01-12, .. = alle)
# DD = Tag (01-31, .. = alle)
# d = Wochentag (1=Mo, 7=So, . = alle)
# HH = Stunde (00-23)

# Short-Test täglich um 02:00
-s S/../.././02

# Long-Test jeden Samstag um 03:00
-s L/../../6/03

# Kombiniert
-s (S/../.././02|L/../../6/03)

Service neu starten

systemctl restart smartd
systemctl status smartd

Benachrichtigungen

Mail-Konfiguration

# In /etc/smartd.conf
-M exec /usr/share/smartmontools/smartd-runner

# Oder eigenes Script
-M exec /usr/local/bin/smart-alert.sh

Alert-Script

#!/bin/bash
# /usr/local/bin/smart-alert.sh
#
# Notification hook for smartd (referenced via `-M exec` in /etc/smartd.conf).
# Sends a warning mail and posts the same event to a webhook; remove the
# channel you do not use.
#
# Variables exported by smartd:
#   $SMARTD_DEVICE     - device
#   $SMARTD_DEVICETYPE - device type
#   $SMARTD_FAILTYPE   - failure type
#   $SMARTD_MESSAGE    - message

echo "SMART Alert: $SMARTD_DEVICE" | mail -s "SMART Warning: $SMARTD_FAILTYPE" admin@example.de

# Or: webhook
# Each field is passed through --data-urlencode: SMARTD_MESSAGE is free text
# and may contain spaces, '&' or '=', which would otherwise corrupt the
# form-encoded request body.
curl -X POST https://alerts.example.de/smart \
    --data-urlencode "device=$SMARTD_DEVICE" \
    --data-urlencode "message=$SMARTD_MESSAGE"

RAID-Controller

MegaRAID

# Hinter MegaRAID
smartctl -a -d megaraid,0 /dev/sda
smartctl -a -d megaraid,1 /dev/sda

3ware

# 3ware Controller
smartctl -a -d 3ware,0 /dev/twa0

HP CCISS

# HP Smart Array
smartctl -a -d cciss,0 /dev/cciss/c0d0

Monitoring-Script

#!/bin/bash
# disk-health-check.sh
#
# Prints a short S.M.A.R.T. summary (health verdict, temperature, reallocated
# sectors, power-on hours) for every device listed in DISKS and emits WARNING
# lines when the health check fails or a threshold is exceeded.
#
# Environment overrides:
#   DISKS             - space-separated list of device nodes to check
#   ALERT_TEMP        - temperature threshold in degrees Celsius
#   ALERT_REALLOCATED - threshold for the reallocated sector count
#
# `set -e` / `pipefail` are deliberately not enabled: smartctl reports disk
# problems through a non-zero exit status, which would abort the script
# before the findings are printed.

DISKS="${DISKS:-/dev/sda /dev/sdb /dev/nvme0}"
ALERT_TEMP="${ALERT_TEMP:-50}"
ALERT_REALLOCATED="${ALERT_REALLOCATED:-10}"

# Reads a `smartctl -A` attribute table on stdin and prints the raw value
# (10th column) of the attribute whose name equals $1; prints nothing when
# the attribute is absent.
attr_raw() {
    awk -v name="$1" '$2 == name { print $10; exit }'
}

# Succeeds when $1 is a non-empty string of decimal digits.
is_uint() {
    [[ "$1" =~ ^[0-9]+$ ]]
}

# Prints the report for a single device.
# Arguments: $1 - device node
check_disk() {
    local disk=$1
    local health attrs temp reallocated hours

    echo "=== $disk ==="

    # Health check
    health=$(smartctl -H "$disk" | awk '/SMART overall-health/ { print $NF }')
    echo "Health: $health"

    if [ "$health" != "PASSED" ]; then
        echo "WARNING: Disk health check failed!"
    fi

    # Fetch the attribute table once and reuse it for all lookups below.
    attrs=$(smartctl -A "$disk")

    # Temperature
    temp=$(attr_raw Temperature_Celsius <<<"$attrs")
    if [ -n "$temp" ]; then
        echo "Temperature: ${temp}°C"
        # `[ -gt ]` compares in decimal, so leading zeros are harmless.
        if is_uint "$temp" && [ "$temp" -gt "$ALERT_TEMP" ]; then
            echo "WARNING: Temperature above threshold!"
        fi
    fi

    # Reallocated sectors
    reallocated=$(attr_raw Reallocated_Sector_Ct <<<"$attrs")
    if [ -n "$reallocated" ]; then
        echo "Reallocated Sectors: $reallocated"
        if is_uint "$reallocated" && [ "$reallocated" -gt "$ALERT_REALLOCATED" ]; then
            echo "WARNING: High reallocated sector count!"
        fi
    fi

    # Power-on hours
    hours=$(attr_raw Power_On_Hours <<<"$attrs")
    if [ -n "$hours" ]; then
        echo "Power-On Hours: $hours"
    fi
}

# Iterates over DISKS and reports every entry that is an existing device node.
main() {
    local disk

    # DISKS is intentionally unquoted: it is a space-separated list.
    for disk in $DISKS; do
        # Accept block devices (/dev/sdX) as well as character devices:
        # the NVMe controller node /dev/nvme0 is a character device and
        # would be skipped by a plain -b test.
        if [ ! -b "$disk" ] && [ ! -c "$disk" ]; then
            continue
        fi

        check_disk "$disk"
        echo ""
    done
}

main "$@"

Prometheus-Export

# smartctl_exporter
# Starts the exporter as a detached, privileged container with the host's
# /dev mounted read-only and the metrics port 9633 published.
# --user root: the image otherwise runs as an unprivileged user, and smartctl
# needs root to open the device nodes.
docker run -d \
    --name smartctl-exporter \
    --privileged \
    --user root \
    -v /dev:/dev:ro \
    -p 9633:9633 \
    prometheuscommunity/smartctl-exporter

# Metrics: http://localhost:9633/metrics

Zusammenfassung

Befehl	Funktion
smartctl -H	Health-Check
smartctl -A	Alle Attribute
smartctl -a	Vollständiger Report
smartctl -t short	Kurzer Test
smartctl -t long	Langer Test
smartctl -l error	Error-Log
smartctl -l selftest	Test-Ergebnisse

Kritische Attribute	Bedeutung
Reallocated_Sector_Ct	Defekte Sektoren
Current_Pending_Sector	Ausstehende Sektoren
Offline_Uncorrectable	Nicht korrigierbar
Reported_Uncorrectable	Gemeldete Fehler

Schwellwert	Aktion
Reallocated > 0	Beobachten
Reallocated > 100	Disk ersetzen planen
FAILED Status	Sofort ersetzen

Fazit

smartmontools ist unverzichtbar für Festplatten-Überwachung. Regelmäßige S.M.A.R.T.-Checks erkennen Probleme frühzeitig. Der smartd-Daemon automatisiert die Überwachung. Bei steigenden Reallocated-Sektoren sollte die Disk ersetzt werden. Für produktive Systeme ist ein Monitoring-System mit Alerting empfehlenswert.