# Prometheus alerting rules, written as one YAML document per rule file.
#
# Contents:
#   1. cpu_over.yml    - CPU utilisation alert
#   2. memory_over.yml - memory utilisation alert
#   3. node_down.yml   - instance liveness alert
#
# Each document below holds the content for the file named in its header
# comment.

---
# 1. cpu_over.yml
groups:
  - name: CPU报警规则
    rules:
      - alert: CPU使用率告警
        # CPU busy percentage per instance: 100 minus the average idle
        # fraction across that instance's CPUs, scaled to percent.
        # rate() over a 5m window is used instead of irate() over 1m:
        # irate() only looks at the last two samples, so short spikes make
        # the condition flap and keep resetting the `for` timer, and a 1m
        # window may not contain two samples at all.
        expr: >-
          100 - (avg by (instance)
          (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100 > 90
        # The condition must hold continuously for this long before firing.
        for: 1m
        labels:
          user: prometheus
          severity: warning
        annotations:
          # The instance label is included so the notification identifies
          # which server crossed the threshold.
          description: "服务器: {{ $labels.instance }} CPU使用超过90%!(当前值: {{ $value }}%)"

---
# 2. memory_over.yml
groups:
  - name: 内存报警规则
    rules:
      - alert: 内存使用率告警
        # Used memory percentage: total minus (free + buffers + cached),
        # divided by total, scaled to percent.
        # NOTE(review): where node_memory_MemAvailable_bytes is exported it
        # may be a more accurate basis for "used" - confirm before switching.
        expr: >-
          (node_memory_MemTotal_bytes -
          (node_memory_MemFree_bytes + node_memory_Buffers_bytes +
          node_memory_Cached_bytes)) /
          node_memory_MemTotal_bytes * 100 > 80
        # The condition must hold continuously for this long before firing.
        for: 1m
        labels:
          user: prometheus
          severity: warning
        annotations:
          # The instance label is included so the notification identifies
          # which server crossed the threshold.
          description: "服务器: {{ $labels.instance }} 内存使用超过80%!(当前值: {{ $value }}%)"

---
# 3. node_down.yml
groups:
  - name: 实例存活告警规则
    rules:
      - alert: 实例存活告警
        # `up` is 0 when the most recent scrape of the target failed.
        expr: up == 0
        # Require a full minute of failed scrapes before firing.
        for: 1m
        labels:
          user: prometheus
          severity: warning
        annotations:
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."