admin管理员组文章数量:1122847
Prometheus
1、监控主机指标
本文介绍如何使用 Prometheus 监控主机的 CPU、磁盘、内存、负载等基础指标。文中配置目前已在生产环境使用,基于 node_exporter 0.18.1 版本,操作系统为 CentOS 7.x。使用之前,请将规则中 job="gt-dwz-node-exporter" 的值修改为自己在 Prometheus 中配置的 job 名称。
2、Prometheus配置项
在 prometheus.yml 配置文件的 scrape_configs 下添加如下配置:
# Scrape job for the gt-dwz hosts. Append this list item under the existing
# `scrape_configs:` key in prometheus.yml (indent it to match the other jobs).
############ gt-dwz #################
- job_name: "gt-dwz-node-exporter"
  static_configs:
    # node_exporter endpoints (port 9100) of the monitored hosts.
    - targets:
        - '10.1.5.123:9100'
        - '10.1.5.124:9100'
        - '10.1.5.125:9100'
        - '10.1.5.126:9100'
      # Extra label attached to every series scraped from the targets above.
      labels:
        service: gt-dwz-monitor
3、PromQL判断rules文件
[root@gtcq-gt-monitor-prometheus-01 rules]# more gt-dwz-monitor.rules
# Alerting rules for hosts scraped by the "gt-dwz-node-exporter" job.
# Replace the job name in every selector with the job_name configured in your
# own prometheus.yml before using this file.
#
# Every rule sets an `alert_host` label: reReplaceAll removes the first ":" and
# everything after it from the instance label, leaving only the host part.
groups:
  - name: dwz-gt-monitor
    rules:
      # Scrape target down: `up` has been 0 for the whole `for` window.
      - alert: "node-Agent告警"
        expr: up{job="gt-dwz-node-exporter"} == 0
        for: 120s
        labels:
          severity: "重要"
          team: dwz-gt-monitor
          alert_type: "Agent告警"
          alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
        annotations:
          # Duration in the text matches the 120s `for` window above.
          summary: "{{ $labels.instance }} 已停止采集监控数据 120s!"
          description: "{{ $labels.instance }} job {{ $labels.job }} 暴露监控数据已停止."

      # CPU usage: 100 minus the idle share of CPU time over 5m, per instance.
      - alert: "CPU使用率监控"
        expr: |
          ceil(
            100
            - sum(increase(node_cpu_seconds_total{job="gt-dwz-node-exporter",mode="idle"}[5m])) by (instance)
            / sum(increase(node_cpu_seconds_total{job="gt-dwz-node-exporter"}[5m])) by (instance)
            * 100
          ) > 80
        for: 2m
        labels:
          severity: "重要"
          # Aligned with the other rules in this group (was "bdfb").
          team: dwz-gt-monitor
          alert_type: "CPU告警"
          alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
        annotations:
          summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} CPU使用率过高"
          description: "IP:{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}的CPU使用大于80% (当前值: {{ $value }})"

      # Filesystem usage per mountpoint, limited to ext3/ext4/xfs/nfs.
      - alert: "磁盘使用率监控"
        expr: |
          round(
            (1 - (
              node_filesystem_avail_bytes{fstype=~"ext3|ext4|xfs|nfs",job="gt-dwz-node-exporter"}
              / node_filesystem_size_bytes{fstype=~"ext3|ext4|xfs|nfs",job="gt-dwz-node-exporter"}
            )) * 100
          ) > 80
        for: 2m
        labels:
          severity: "重要"
          team: dwz-gt-monitor
          alert_type: "Disk告警"
          alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
        annotations:
          summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高"
          description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}的{{ $labels.mountpoint }} 分区使用大于80% (当前值: {{ $value }}%)"

      # Memory usage: share of MemTotal that is not MemAvailable.
      - alert: "内存使用率监控"
        expr: |
          ceil(
            (1 - (
              node_memory_MemAvailable_bytes{job="gt-dwz-node-exporter"}
              / node_memory_MemTotal_bytes{job="gt-dwz-node-exporter"}
            )) * 100
          ) > 80
        for: 2m
        labels:
          severity: "重要"
          team: dwz-gt-monitor
          alert_type: "MEM告警"
          alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
        annotations:
          summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}内存使用率过高"
          description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}内存使用大于80% (当前值: {{ $value }})"

      # 5-minute load average above a fixed threshold of 100.
      - alert: "服务器大法宝CPULoad5"
        expr: node_load5{job="gt-dwz-node-exporter"} > 100
        for: 2m
        labels:
          severity: "重要"
          team: dwz-gt-monitor
          alert_type: "负载告警"
          alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
        annotations:
          summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}CPU负载过高"
          description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} CPU负载load大于100 (当前值: {{ $value }})"

      # Allocated file descriptors above 50000.
      - alert: "服务器文件句柄监控"
        expr: node_filefd_allocated{job="gt-dwz-node-exporter"} > 50000
        for: 2m
        labels:
          severity: "重要"
          team: dwz-gt-monitor
          alert_type: "文件句柄告警"
          alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
        annotations:
          summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 文件句柄使用过高"
          description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 文件句柄使用过高大于50000 (当前值: {{ $value }})"

      # TCP sockets in TIME_WAIT above 15000.
      - alert: "服务器TCP连接数监控"
        expr: node_sockstat_TCP_tw{job="gt-dwz-node-exporter"} > 15000
        for: 2m
        labels:
          severity: "重要"
          team: dwz-gt-monitor
          alert_type: "TCP连接数告警"
          alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
        annotations:
          summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 等待关闭的TCP连接数过高"
          description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 等待关闭的TCP连接数TIME_WAIT过高大于15000 (当前值: {{ $value }})"

      # Inbound traffic per instance in MB/s, summed over non-virtual devices.
      # rate() is used instead of irate(): irate only looks at the last two
      # samples, so short dips can keep resetting the `for` timer.
      # Prometheus regexes are fully anchored, so the device filter uses
      # "virbr.*" and "lo"; the glob-style "virbr*" never matched "virbr0".
      - alert: "服务器入口流量监控"
        expr: |
          round(
            sum by (instance) (
              rate(node_network_receive_bytes_total{job="gt-dwz-node-exporter",device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])
            ) / 1024 / 1024
          ) > 50
        for: 2m
        labels:
          severity: "重要"
          team: dwz-gt-monitor
          alert_type: "流量告警"
          alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
        annotations:
          summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}监控入口流量过高"
          description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 监控入口流量过高,大于50MB/s (告警值: {{ $value }}MB/s)"

      # Outbound traffic per instance in MB/s; same device filter and rate()
      # reasoning as the inbound rule.
      - alert: "服务器出口流量监控"
        expr: |
          round(
            sum by (instance) (
              rate(node_network_transmit_bytes_total{job="gt-dwz-node-exporter",device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])
            ) / 1024 / 1024
          ) > 50
        for: 2m
        labels:
          severity: "重要"
          team: dwz-gt-monitor
          alert_type: "流量告警"
          alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
        annotations:
          summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 监控出口流量过高"
          description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 监控出口流量过高,大于50MB/s (告警值: {{ $value }}MB/s)"
[root@gtcq-gt-monitor-prometheus-01 rules]#
4、测试告警
为了验证告警能否正常触发,将磁盘使用率规则的阈值临时从 80% 调低到 10%,修改后的规则如下:
# Test variant of the disk-usage rule: the threshold is lowered to 10% so the
# alert fires on almost any host. It replaces the rule of the same name in the
# rules file (indent it to sit under `rules:`); restore the 80% threshold once
# the alert pipeline has been verified.
- alert: "磁盘使用率监控"
  expr: |
    round(
      (1 - (
        node_filesystem_avail_bytes{fstype=~"ext3|ext4|xfs|nfs",job="gt-dwz-node-exporter"}
        / node_filesystem_size_bytes{fstype=~"ext3|ext4|xfs|nfs",job="gt-dwz-node-exporter"}
      )) * 100
    ) > 10
  for: 2m
  labels:
    severity: "重要"
    team: dwz-gt-monitor
    alert_type: "Disk告警"
    # Strip the ":port" suffix so alert_host carries only the host part.
    alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
  annotations:
    summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高"
    description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}的{{ $labels.mountpoint }} 分区使用大于10% (当前值: {{ $value }}%)"
本文标签: Prometheus
版权声明:本文标题:Prometheus 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1687807223a143824.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论