Alerts

node_cpu_usage (2 active)
# Warns when a Swarm node's non-idle CPU percentage, averaged per node_name,
# stays above 50 for one minute.
# The `* on(instance) group_left(node_name) node_meta` join copies node_name
# onto each CPU series (presumably node_meta is a constant-1 info series —
# TODO confirm against whatever exports it).
alert: node_cpu_usage
expr: |-
  100 - (
    avg by(node_name) (
      irate(node_cpu_seconds_total{mode="idle"}[1m])
        * on(instance) group_left(node_name) node_meta
        * 100
    )
  ) > 50
for: 1m
labels:
  severity: warning
annotations:
  description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize $value}}%.
  summary: CPU alert for Swarm node '{{ $labels.node_name }}'
Labels State Active Since Value
alertname="node_cpu_usage" node_name="ip-172-31-35-205" severity="warning" firing 2024-05-19 07:29:13.230555093 +0000 UTC 67.23333333386108
alertname="node_cpu_usage" node_name="ip-172-31-12-99" severity="warning" firing 2024-05-19 07:31:13.230555093 +0000 UTC 92.26666666722546
node_disk_usage (1 active)
# Warns when the used share of the /rootfs filesystem (size minus free, as a
# percentage of size) stays above 85 for one minute.
# The join against node_meta on `instance` attaches the node_name label used
# by the annotations below.
alert: node_disk_usage
expr: |-
  (
    (
      node_filesystem_size_bytes{mountpoint="/rootfs"}
        - node_filesystem_free_bytes{mountpoint="/rootfs"}
    ) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}
  )
    * on(instance) group_left(node_name) node_meta
  > 85
for: 1m
labels:
  severity: warning
annotations:
  description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize $value}}%.
  summary: Disk alert for Swarm node '{{ $labels.node_name }}'
Labels State Active Since Value
alertname="node_disk_usage" device="/dev/root" fstype="ext4" instance="10.0.13.31:9100" job="node-exporter" mountpoint="/rootfs" node_name="ip-172-31-14-59" severity="warning" firing 2024-05-14 06:46:43.230555093 +0000 UTC 86.82064901677887
node_disk_fill_rate_6h (0 active)
# Critical alert: linearly extrapolates the last hour of free bytes on /rootfs
# 6 * 3600 seconds ahead and fires when the projection is below zero.
# The condition must hold for a full hour (`for: 1h`) before the alert fires.
# The join against node_meta on `instance` attaches the node_name label.
alert: node_disk_fill_rate_6h
expr: |-
  predict_linear(
    node_filesystem_free_bytes{mountpoint="/rootfs"}[1h],
    6 * 3600
  )
    * on(instance) group_left(node_name) node_meta
  < 0
for: 1h
labels:
  severity: critical
annotations:
  description: Swarm node {{ $labels.node_name }} disk is going to fill up in 6h.
  summary: Disk fill alert for Swarm node '{{ $labels.node_name }}'
node_memory_usage (0 active)
# Warns when a node's memory usage, computed as
# (MemTotal - MemAvailable) / MemTotal * 100 and summed per node_name,
# stays above 80 for one minute.
# The join against node_meta on `instance` supplies the node_name label that
# the aggregation groups by.
alert: node_memory_usage
expr: |-
  sum by(node_name) (
    (
      (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
        / node_memory_MemTotal_bytes
    )
      * on(instance) group_left(node_name) node_meta
      * 100
  ) > 80
for: 1m
labels:
  severity: warning
annotations:
  description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize $value}}%.
  summary: Memory alert for Swarm node '{{ $labels.node_name }}'
task_high_cpu_usage_50 (0 active)
# Fires when the summed per-second CPU rate of a Swarm task's containers
# (grouped by task name and node id, 1m rate window, scaled by 100) stays
# above 50 for one minute. Only series with a non-empty
# container_label_com_docker_swarm_task_name are considered.
# NOTE(review): unlike the node_* rules, this rule sets no `severity` label —
# confirm whether Alertmanager routing relies on one.
alert: task_high_cpu_usage_50
expr: sum
  by(container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id)
  (rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[1m]))
  * 100 > 50
for: 1m
annotations:
  # The description is a single-quoted YAML scalar, so the literal quotes
  # around the node id must be written doubled ('') or the scalar ends early
  # and the document fails to parse.
  description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{
    $labels.container_label_com_docker_swarm_node_id }}'' CPU usage is at {{ humanize
    $value}}%.'
  summary: CPU alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name
    }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}'
task_high_memory_usage_1g (0 active)
# Fires when the summed container_memory_rss of a Swarm task's containers
# (grouped by task name and node id) stays above 1.5e+09 for one minute.
# Only series with a non-empty container_label_com_docker_swarm_task_name
# are considered.
# NOTE(review): the rule name says "1g" but the threshold is 1.5e+09 —
# confirm which is intended; the threshold is left unchanged here.
# NOTE(review): unlike the node_* rules, this rule sets no `severity` label —
# confirm whether Alertmanager routing relies on one.
alert: task_high_memory_usage_1g
expr: sum
  by(container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id)
  (container_memory_rss{container_label_com_docker_swarm_task_name=~".+"})
  > 1.5e+09
for: 1m
annotations:
  # The description is a single-quoted YAML scalar, so the literal quotes
  # around the node id must be written doubled ('') or the scalar ends early
  # and the document fails to parse.
  description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{
    $labels.container_label_com_docker_swarm_node_id }}'' memory usage is {{ humanize
    $value}}.'
  summary: Memory alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name
    }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}'