Node exporter

Node exporter

Overview Installation Recording rules Dashboards Alerting rules Grafana Cloud Integration

On this page:

This quickstart includes the following alerting rules:

  • NodeFilesystemSpaceFillingUp

    Filesystem on device at instance has only X available space left and is filling up.

  • NodeFilesystemAlmostOutOfSpace

    Filesystem on device at instance has only X available space left.

  • NodeFilesystemFilesFillingUp

    Filesystem on device at instance has only X available inodes left and is filling up.

  • NodeFilesystemAlmostOutOfFiles

    Filesystem on device at instance has only X available inodes left.

  • NodeNetworkReceiveErrs

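    Instance interface device has encountered X receive errors in the last two minutes.

  • NodeNetworkTransmitErrs

    Instance interface device has encountered X transmit errors in the last two minutes.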

  • NodeHighNumberConntrackEntriesUsed

    X% of conntrack entries are used.

  • NodeTextFileCollectorScrapeError

    Node Exporter text file collector failed to scrape.

  • NodeClockSkewDetected

    Clock on instance is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.

  • NodeClockNotSynchronising

    Clock on instance is not synchronising. Ensure NTP is configured on this host.

  • NodeRAIDDegraded

    RAID array ‘device’ on instance is in degraded state due to one or more disk failures. Number of spare drives is insufficient to fix issue automatically.

  • NodeRAIDDiskFailure

    At least one device in RAID array on instance failed. Array ‘device’ needs attention and possibly a disk swap.

Download the following alerting rules YAML file
yaml
"groups":
- "name": "node-exporter"
  "rules":
  # Warning tier: under 40% of the filesystem's bytes are available, a linear
  # extrapolation of the last 6h of samples puts available bytes below zero
  # 24h from now, and the filesystem is not read-only. Series with an empty
  # fstype label are excluded. The condition must hold for 1h.
  - alert: NodeFilesystemSpaceFillingUp
    expr: |
      (
        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 40
      and
        predict_linear(node_filesystem_avail_bytes{job="node",fstype!=""}[6h], 24*60*60) < 0
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: 'Filesystem is predicted to run out of space within the next 24 hours.'
      description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.'
  # Critical tier: under 20% of the filesystem's bytes are available, a linear
  # extrapolation of the last 6h of samples puts available bytes below zero
  # 4h from now, and the filesystem is not read-only. Must hold for 1h.
  - alert: NodeFilesystemSpaceFillingUp
    expr: |
      (
        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 20
      and
        predict_linear(node_filesystem_avail_bytes{job="node",fstype!=""}[6h], 4*60*60) < 0
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
    annotations:
      summary: 'Filesystem is predicted to run out of space within the next 4 hours.'
      description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.'
  # Warning tier: a writable filesystem has had less than 5% of its bytes
  # available for 1h. No trend prediction is involved; this is a plain
  # threshold on the available/size percentage.
  - alert: NodeFilesystemAlmostOutOfSpace
    expr: |
      (
        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: 'Filesystem has less than 5% space left.'
      description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.'
  # Critical tier: a writable filesystem has had less than 3% of its bytes
  # available for 1h.
  - alert: NodeFilesystemAlmostOutOfSpace
    expr: |
      (
        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
    annotations:
      summary: 'Filesystem has less than 3% space left.'
      description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.'
  # Warning tier for inode exhaustion: under 40% of file entries (inodes) are
  # free, a linear extrapolation of the last 6h of samples puts the free count
  # below zero 24h from now, and the filesystem is not read-only. Must hold
  # for 1h.
  - alert: NodeFilesystemFilesFillingUp
    expr: |
      (
        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40
      and
        predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.'
      description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.'
  # Critical tier for inode exhaustion: under 20% of file entries (inodes) are
  # free, a linear extrapolation of the last 6h of samples puts the free count
  # below zero 4h from now, and the filesystem is not read-only. Must hold
  # for 1h.
  - alert: NodeFilesystemFilesFillingUp
    expr: |
      (
        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20
      and
        predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
    annotations:
      summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.'
      description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.'
  # Warning tier: a writable filesystem has had less than 5% of its file
  # entries (inodes) free for 1h.
  - alert: NodeFilesystemAlmostOutOfFiles
    expr: |
      (
        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: 'Filesystem has less than 5% inodes left.'
      description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.'
  # Critical tier: a writable filesystem has had less than 3% of its file
  # entries (inodes) free for 1h.
  - alert: NodeFilesystemAlmostOutOfFiles
    expr: |
      (
        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
    annotations:
      summary: 'Filesystem has less than 3% inodes left.'
      description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.'
  # Fires when, for 1h, the per-second rate of receive errors divided by the
  # per-second rate of received packets (both over a 2m window) exceeds 0.01,
  # i.e. more than 1% of received packets are errors.
  #
  # $value is that ratio, not an error count, so the description renders it
  # with humanizePercentage. Formatting it with printf "%.0f" rounded the
  # ratio to "0" or "1" and presented it as a number of errors.
  #
  # NOTE(review): unlike the filesystem rules in this group, the expression
  # carries no job="node" selector - confirm that matching all jobs is intended.
  - alert: NodeNetworkReceiveErrs
    expr: |
      rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: 'Network interface is reporting many receive errors.'
      description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ $value | humanizePercentage }} receive errors in the last two minutes.'
  # Fires when, for 1h, the per-second rate of transmit errors divided by the
  # per-second rate of transmitted packets (both over a 2m window) exceeds
  # 0.01, i.e. more than 1% of transmitted packets are errors.
  #
  # $value is that ratio, not an error count, so the description renders it
  # with humanizePercentage. Formatting it with printf "%.0f" rounded the
  # ratio to "0" or "1" and presented it as a number of errors.
  #
  # NOTE(review): unlike the filesystem rules in this group, the expression
  # carries no job="node" selector - confirm that matching all jobs is intended.
  - alert: NodeNetworkTransmitErrs
    expr: |
      rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: 'Network interface is reporting many transmit errors.'
      description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ $value | humanizePercentage }} transmit errors in the last two minutes.'
  # Fires when the number of conntrack entries exceeds 75% of the configured
  # limit. There is no `for` clause, so no pending period is applied.
  # NOTE(review): the expression has no job selector - confirm that is intended.
  - alert: NodeHighNumberConntrackEntriesUsed
    expr: |
      (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
    labels:
      severity: warning
    annotations:
      summary: 'Number of conntrack are getting close to the limit.'
      description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
  # Fires when the textfile collector's scrape-error metric equals 1 for the
  # "node" job. There is no `for` clause, so no pending period is applied.
  - alert: NodeTextFileCollectorScrapeError
    expr: |
      node_textfile_scrape_error{job="node"} == 1
    labels:
      severity: warning
    annotations:
      summary: 'Node Exporter text file collector failed to scrape.'
      description: 'Node Exporter text file collector failed to scrape.'
  # Fires when, for 10m, the clock offset is beyond 0.05 seconds and is not
  # moving back toward zero:
  #   - offset above +0.05s with a non-negative derivative over 5m, or
  #   - offset below -0.05s with a non-positive derivative over 5m.
  #
  # The description states the 0.05s threshold used by the expression; it
  # previously said "300s", which did not match any value in the rule.
  #
  # NOTE(review): the expression has no job selector - confirm that is intended.
  - alert: NodeClockSkewDetected
    expr: |
      (
        node_timex_offset_seconds > 0.05
      and
        deriv(node_timex_offset_seconds[5m]) >= 0
      )
      or
      (
        node_timex_offset_seconds < -0.05
      and
        deriv(node_timex_offset_seconds[5m]) <= 0
      )
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Clock skew detected.'
      description: 'Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.'
  # Fires when, for 10m, the sync-status metric has been 0 at some point in
  # the trailing 5m window (its minimum over that window is 0) and the
  # reported maximum clock error is at least 16 seconds.
  # NOTE(review): the expression has no job selector - confirm that is intended.
  - alert: NodeClockNotSynchronising
    expr: |
      min_over_time(node_timex_sync_status[5m]) == 0
      and
      node_timex_maxerror_seconds >= 16
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Clock not synchronising.'
      description: 'Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.'
  # Fires when, for 15m, an md array has fewer active disks than it requires:
  # required disks minus disks in state "active" is greater than zero. The
  # subtraction matches series while ignoring the `state` label, which only
  # the right-hand side is filtered on.
  # NOTE(review): the expression has no job selector - confirm that is intended.
  - alert: NodeRAIDDegraded
    expr: |
      node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: 'RAID Array is degraded'
      description: "RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically."
  # Fires when an md array reports one or more disks in state "failed".
  # There is no `for` clause, so no pending period is applied.
  # NOTE(review): the expression has no job selector - confirm that is intended.
  - alert: NodeRAIDDiskFailure
    expr: |
      node_md_disks{state="failed"} > 0
    labels:
      severity: warning
    annotations:
      summary: 'Failed device in RAID array'
      description: "At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap."

This alerting rule YAML file was generated using the Node-exporter mixin. It uses the job=node label selector to query metrics by default. If you need to use a different selector, modify the selector in config.libsonnet and regenerate the alerting rules following the instructions in the mixin repository.