采用prometheus方式进行部署,监控本地服务器、网络线路、域名访问,达到阈值时触发告警,从而快速定位问题源,加快响应速度
1、创建所需系统结构目录并给目录授权
2、prometheus部署
1)创建docker-compose文件
vim docker-compose.yml
-------------------------------------包含部署grafana、alertmanager、nginx及各类exporter(consul采用分离部署,见第5节)----------------------------------
# Monitoring stack: Prometheus, Alertmanager, Grafana, blackbox/VMware/SNMP
# exporters and an nginx landing page. All services share the "prom" bridge
# network so they can reach each other by service name.
version: '3.7'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: always
    depends_on:
      - alertmanager
    environment:
      # Same IANA zone name as the alertmanager service (was "CST-8").
      - TZ=Asia/Shanghai
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./rules:/etc/prometheus/rules/
      - ./prometheus/data:/prometheus
      - /etc/hosts:/etc/hosts
      - /etc/localtime:/etc/localtime
    ports:
      - "9090:9090"
    networks:
      - prom

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    restart: always
    environment:
      TZ: Asia/Shanghai
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - ./alertmanager/mail.tmpl:/etc/alertmanager/mail.tmpl
      - /etc/localtime:/etc/localtime
    ports:
      - "9093:9093"
      - "9094:9094"
    networks:
      - prom

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: always
    depends_on:
      - prometheus
    volumes:
      - ./grafana:/var/lib/grafana
      - /etc/localtime:/etc/localtime
    ports:
      - "3000:3000"
    # Previously missing: without it grafana lands on the default network and
    # cannot resolve the other services by name.
    networks:
      - prom

  blackbox_exporter:
    image: quay.io/prometheus/blackbox-exporter:latest
    container_name: blackbox_exporter
    restart: always
    command: --config.file=/config/blackbox.yml
    volumes:
      - /etc/localtime:/etc/localtime
      - ./blackbox_exporter/blackbox.yml:/config/blackbox.yml
    ports:
      - "9115:9115"
    networks:
      - prom

  # One vmware_exporter container per monitored physical host; each reads its
  # own env file and is published on a distinct host port.
  vmware_exporter:
    image: pryorda/vmware_exporter
    container_name: vmware_exporter
    restart: always
    env_file:
      - ./vm_config/vm_config.env
    ports:
      - "9273:9272"
    networks:
      - prom

  nginx:
    image: nginx:latest
    container_name: nginx
    restart: always
    volumes:
      - /usr/share/zoneinfo/PRC:/etc/localtime
      - ./nginx/html:/usr/share/nginx/html
      - ./nginx/conf.d:/etc/nginx/conf.d
      - ./nginx/image:/image
    ports:
      - "80:80"
    networks:
      - prom

  vmware_exporter2:
    image: pryorda/vmware_exporter
    container_name: vmware_exporter2
    restart: always
    env_file:
      - ./vm_config/vm2_config.env
    ports:
      - "9272:9272"
    networks:
      - prom

  vmware_exporter-jk:
    image: pryorda/vmware_exporter
    container_name: vmware_exporter-jk
    restart: always
    env_file:
      - ./vm_config/vm3_config.env
    ports:
      - "9274:9272"
    networks:
      - prom

  # SNMP exporter loaded with the custom generated snmp.yml.
  snmp-AD:
    image: prom/snmp-exporter
    container_name: snmp_exporter-AD
    restart: always
    command: --config.file=/etc/snmp_exporter/snmp.yml
    volumes:
      - ./snmp_exporter/snmp_exporter-AD/snmp_exporter/generator/snmp.yml:/etc/snmp_exporter/snmp.yml
    ports:
      - "9116:9116"
    networks:
      - prom

  # Second SNMP exporter; no snmp.yml is mounted, so it runs with whatever
  # config ships inside the image at that path.
  snmp-AD-2:
    image: prom/snmp-exporter
    container_name: snmp_exporter-AD-2
    restart: always
    command: --config.file=/etc/snmp_exporter/snmp.yml
    ports:
      - "9117:9116"
    networks:
      - prom

networks:
  prom:
    driver: bridge
-------------------------------------------------------------------------------------------------------------------
2)配置prometheus配置文件
vim prometheus.yml
# Prometheus server configuration: rule files, Alertmanager endpoint and all
# scrape jobs (self, VMware exporters, federation, Consul-discovered node
# exporters, SNMP devices, blackbox HTTP probes).
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - '/etc/prometheus/rules/ad-alert/*.yml'
  - '/etc/prometheus/rules/https-alert/*.yml'
  - '/etc/prometheus/rules/https-duration/*.yml'
  - '/etc/prometheus/rules/node-alert/*.yml'

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'X.X.X.X:9093'

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets:
          - 'X.X.X.X:9090'

  - job_name: 'vm-exporter'
    static_configs:
      - targets:
          - 'X.X.X.X:9272'  # local physical host monitoring
          - 'X.X.X.X:9273'
          - 'X.X.X.X:9274'

  # Federation: pull series from another Prometheus server.
  - job_name: 'federate'
    metrics_path: '/federate'
    honor_labels: true
    params:
      'match[]':
        - '{job="prometheus"}'
        # NOTE(review): this selector matches every series, which makes the
        # selector above redundant - confirm that pulling everything is intended.
        - '{__name__=~".*"}'
    static_configs:
      - targets:
          - 'X.X.X.X:9090'

  # Node exporters registered in Consul (see the registration script).
  - job_name: 'consul-node-exporter'
    metrics_path: /metrics
    scheme: http
    scrape_interval: 15s
    scrape_timeout: 5s
    consul_sd_configs:
      - server: 'X.X.X.X:8500'
        refresh_interval: 30s
        services: ['node-exporter']
    relabel_configs:
      # Take the instance name from the "instance=<name>" Consul tag.
      - source_labels: [__meta_consul_tags]
        regex: '.*,instance=([^,]*).*'
        target_label: instance
      - source_labels: [__meta_consul_service_address]
        target_label: 'ipaddress'
      - source_labels: [__meta_consul_service_id]
        target_label: 'hostname'
      - source_labels: [__meta_consul_service_metadata_group]
        target_label: 'localhost'
      - source_labels: [__meta_consul_service_metadata_environment]
        target_label: 'environment'
      - source_labels: [__meta_consul_service_metadata_Project]
        target_label: 'Project'
      - source_labels: [__meta_consul_service]
        target_label: 'service'

  # Local network device monitoring (custom metrics) through the SNMP
  # exporter on port 9116.
  - job_name: 'snmp'
    metrics_path: /snmp
    scrape_interval: 30s
    scrape_timeout: 30s
    params:
      module: [sangfor]
    relabel_configs:
      # The listed target becomes the ?target= parameter and the instance
      # label; the scrape itself is sent to the exporter address.
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: X.X.X.X:9116
    static_configs:
      - targets:
          - '1x.x.x.x'  # address of the monitored network device
        labels:
          hostname: AD
          group: snmp

  # Local network device monitoring (if_mib) through the SNMP exporter on
  # port 9117.
  - job_name: 'snmp-1'
    metrics_path: /snmp
    scrape_interval: 30s
    scrape_timeout: 30s
    params:
      module: [if_mib]
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: X.X.X.X:9117
    static_configs:
      - targets:
          - 'x.x.x.x'  # address of the monitored network device
        labels:
          hostname: AD-2
          group: snmp

  # Black-box monitoring: probe domain reachability through blackbox_exporter.
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [http_2xx]  # Look for an HTTP 200 response.
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: X.X.X.X:9115  # The blackbox exporter's real hostname:port.
    static_configs:
      - targets:
          - 'https://www.baidu.com'
          - 'https://www.google.com'
          - 'https://www.github.com'
          - 'https://www.youtube.com'
          - 'https://activity.huaweicloud.com'
          - 'https://www.aliyun.com'
          - 'https://cloud.tencent.com'
          - 'https://www.tapd.cn'
          - 'https://www.openai.com'
          - 'https://www.pinterest.com'
          - 'https://www.qq.com'
          - 'https://www.bilibili.com'
3、添加告警rules
1)服务器告警:
# Host-level alerts for node exporters scraped by the "consul-node-exporter"
# job. Every rule copies the instance label and exposes the triggering value
# in the "value" annotation.
groups:
  - name: node-alert
    rules:
      - alert: NodeDown
        expr: up{job="consul-node-exporter"} == 0
        for: 5m
        labels:
          severity: critical
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} down"
          description: "主机: {{ $labels.instance }} 已经宕机 5分钟"
          value: "{{ $value }}"

      - alert: NodeCpuHigh
        expr: 100 * (1 - avg by(instance) (irate(node_cpu_seconds_total{mode="idle",job="consul-node-exporter"}[5m]))) > 80
        for: 1m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} cpu使用率过高"
          description: "CPU 使用率超过 80%"
          value: "{{ $value }}"

      - alert: NodeCpuIowaitHigh
        expr: avg by (instance) (irate(node_cpu_seconds_total{job="consul-node-exporter",mode="iowait"}[5m])) * 100 > 50
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} cpu iowait 使用率过高"
          description: "CPU iowait 使用率超过 50%"
          value: "{{ $value }}"

      # "on (instance)" is required: the right-hand side only carries the
      # instance label, so a plain comparison would never match node_load5.
      - alert: NodeLoad5High
        expr: node_load5{job="consul-node-exporter"} > on (instance) (count by (instance) (node_cpu_seconds_total{job="consul-node-exporter",mode="system"})) * 1.2
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} load(5m) 过高"
          description: "Load(5m) 过高,超出cpu核数 1.2倍"
          value: "{{ $value }}"

      - alert: NodeMemoryHigh
        expr: (1 - node_memory_MemAvailable_bytes{job="consul-node-exporter"} / node_memory_MemTotal_bytes{job="consul-node-exporter"}) * 100 > 90
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} memory 使用率过高"
          description: "Memory 使用率超过 90%"
          value: "{{ $value }}"

      - alert: NodeDiskRootHigh
        expr: (1 - node_filesystem_avail_bytes{job="consul-node-exporter",fstype=~"ext.|xfs",mountpoint ="/"} / node_filesystem_size_bytes{job="consul-node-exporter",fstype=~"ext.|xfs",mountpoint ="/"}) * 100 > 90
        for: 10m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk(/ 分区) 使用率过高"
          description: "Disk(/ 分区) 使用率超过 90%"
          value: "{{ $value }}"

      - alert: NodeDiskBootHigh
        expr: (1 - node_filesystem_avail_bytes{job="consul-node-exporter",fstype=~"ext.|xfs",mountpoint ="/boot"} / node_filesystem_size_bytes{job="consul-node-exporter",fstype=~"ext.|xfs",mountpoint ="/boot"}) * 100 > 80
        for: 10m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk(/boot 分区) 使用率过高"
          description: "Disk(/boot 分区) 使用率超过 80%"
          value: "{{ $value }}"

      - alert: NodeDiskReadHigh
        expr: irate(node_disk_read_bytes_total{job="consul-node-exporter"}[5m]) > 20 * (1024 ^ 2)
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk 读取字节数 速率过高"
          description: "Disk 读取字节数 速率超过 20 MB/s"
          value: "{{ $value }}"

      - alert: NodeDiskWriteHigh
        expr: irate(node_disk_written_bytes_total{job="consul-node-exporter"}[5m]) > 20 * (1024 ^ 2)
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk 写入字节数 速率过高"
          description: "Disk 写入字节数 速率超过 20 MB/s"
          value: "{{ $value }}"

      - alert: NodeDiskReadRateCountHigh
        expr: irate(node_disk_reads_completed_total{job="consul-node-exporter"}[5m]) > 3000
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk iops 每秒读取速率过高"
          description: "Disk iops 每秒读取速率超过 3000 iops"
          value: "{{ $value }}"

      - alert: NodeDiskWriteRateCountHigh
        expr: irate(node_disk_writes_completed_total{job="consul-node-exporter"}[5m]) > 3000
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk iops 每秒写入速率过高"
          description: "Disk iops 每秒写入速率超过 3000 iops"
          value: "{{ $value }}"

      - alert: NodeInodeRootUsedPercentHigh
        expr: (1 - node_filesystem_files_free{job="consul-node-exporter",fstype=~"ext4|xfs",mountpoint="/"} / node_filesystem_files{job="consul-node-exporter",fstype=~"ext4|xfs",mountpoint="/"}) * 100 > 80
        for: 10m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk(/ 分区) inode 使用率过高"
          description: "Disk (/ 分区) inode 使用率超过 80%"
          value: "{{ $value }}"

      - alert: NodeInodeBootUsedPercentHigh
        expr: (1 - node_filesystem_files_free{job="consul-node-exporter",fstype=~"ext4|xfs",mountpoint="/boot"} / node_filesystem_files{job="consul-node-exporter",fstype=~"ext4|xfs",mountpoint="/boot"}) * 100 > 80
        for: 10m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk(/boot 分区) inode 使用率过高"
          description: "Disk (/boot 分区) inode 使用率超过 80%"
          value: "{{ $value }}"

      - alert: NodeFilefdAllocatedPercentHigh
        expr: node_filefd_allocated{job="consul-node-exporter"} / node_filefd_maximum{job="consul-node-exporter"} * 100 > 80
        for: 10m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} filefd 打开百分比过高"
          description: "Filefd 打开百分比 超过 80%"
          value: "{{ $value }}"

      # Bytes/s are multiplied by 8 to compare bit rates; the "*" operators
      # had been lost from both network-rate expressions.
      - alert: NodeNetworkNetinBitRateHigh
        expr: avg by (instance) (irate(node_network_receive_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m]) * 8) > 20 * (1024 ^ 2) * 8
        for: 3m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} network 接收比特数 速率过高"
          description: "Network 接收比特数 速率超过 20MB/s"
          value: "{{ $value }}"

      - alert: NodeNetworkNetoutBitRateHigh
        expr: avg by (instance) (irate(node_network_transmit_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m]) * 8) > 20 * (1024 ^ 2) * 8
        for: 3m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} network 发送比特数 速率过高"
          description: "Network 发送比特数 速率超过 20MB/s"
          value: "{{ $value }}"

      - alert: NodeNetworkNetinPacketErrorRateHigh
        expr: avg by (instance) (irate(node_network_receive_errs_total{device=~"eth0|eth1|ens33|ens37"}[1m])) > 15
        for: 3m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} 接收错误包 速率过高"
          description: "Network 接收错误包 速率超过 15个/秒"
          value: "{{ $value }}"

      # Uses the transmit *error* counter; the previous expression measured
      # all transmitted packets and therefore fired on ordinary traffic.
      - alert: NodeNetworkNetoutPacketErrorRateHigh
        expr: avg by (instance) (irate(node_network_transmit_errs_total{device=~"eth0|eth1|ens33|ens37"}[1m])) > 15
        for: 3m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} 发送错误包 速率过高"
          description: "Network 发送错误包 速率超过 15个/秒"
          value: "{{ $value }}"

      - alert: NodeProcessBlockedHigh
        expr: node_procs_blocked{job="consul-node-exporter"} > 10
        for: 10m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} 当前被阻塞的任务的数量过多"
          description: "Process 当前被阻塞的任务的数量超过 10个"
          value: "{{ $value }}"

      - alert: NodeTimeOffsetHigh
        expr: abs(node_timex_offset_seconds{job="consul-node-exporter"}) > 3 * 60
        for: 2m
        labels:
          severity: info
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} 时间偏差过大"
          description: "Time 节点的时间偏差超过 3m"
          value: "{{ $value }}"
2)域名探测延迟告警
# Fires when the blackbox HTTP probe of a domain stays slower than 5 seconds.
groups:
  - name: https-duration
    rules:
      - alert: DomainAccessDelayExceeds5s
        # Sums all probe_http_duration_seconds series of one probed instance.
        expr: sum(probe_http_duration_seconds{job=~"blackbox"}) by (instance) > 5
        for: 30m
        labels:
          severity: warning
          type: blackbox
        annotations:
          summary: "域名探测,访问延迟超过 5 秒"
          description: "域名:{{ $labels.instance }} 探测延迟大于 5 秒,当前延迟为:{{ $value }}"
3)域名连接告警
# Fires when the blackbox probe of a domain keeps failing.
groups:
  - name: https-alert
    rules:
      - alert: https-domain-access
        expr: sum(probe_success{job=~"blackbox"}) by (instance) <= 0
        # NOTE(review): the summary text says 5 minutes but the alert only
        # fires after 30m - align one with the other.
        for: 30m
        labels:
          severity: warning
          type: blackbox
        annotations:
          summary: "域名失去连接5分钟"
          description: "域名:{{ $labels.instance }} 失去连接"
4)线路状态告警、网络流量线路告警
# Link-down alerts for the interfaces of device "AD-2" (job snmp-1).
# Each rule fires when ifOperStatus of one interface equals 2.
groups:
  - name: 本地网络接口线路掉线告警
    rules:
      - alert: "100M电信CN2海外"
        expr: ifOperStatus{group="snmp", hostname="AD-2", ifDescr="eth1", ifIndex="18", ifName="eth1", instance="X.X.X.X", job="snmp-1"} == 2
        for: 1s
        labels:
          severity: critical
        annotations:
          summary: "100M电信CN2海外线路异常"
          description: "AD-eth1接口-100M电信CN2海外线路连接断开,请及时处理!"

      - alert: "900M联通"
        expr: ifOperStatus{group="snmp", hostname="AD-2", ifDescr="eth2", ifIndex="19", ifName="eth2", instance="X.X.X.X", job="snmp-1"} == 2
        for: 1s
        labels:
          severity: critical
        annotations:
          summary: "900M联通线路异常"
          description: "AD-eth2接口-900M联通线路连接断开,请及时处理!"

      - alert: "1000M腾讯云"
        expr: ifOperStatus{group="snmp", hostname="AD-2", ifDescr="eth3", ifIndex="20", ifName="eth3", instance="X.X.X.X", job="snmp-1"} == 2
        for: 1s
        labels:
          severity: critical
        annotations:
          summary: "1000M腾讯云线路异常"
          description: "AD-eth3接口-1000M腾讯云线路连接断开,请及时处理!"

      - alert: "200M中信"
        expr: ifOperStatus{group="snmp", hostname="AD-2", ifDescr="eth4", ifIndex="21", ifName="eth4", instance="X.X.X.X", job="snmp-1"} == 2
        for: 1s
        labels:
          severity: critical
        annotations:
          summary: "200M中信线路异常"
          description: "AD-eth4接口-200M中信线路连接断开,请及时处理!"

      - alert: "100M专线(IOA)"
        expr: ifOperStatus{group="snmp", hostname="AD-2", ifDescr="eth5", ifIndex="22", ifName="eth5", instance="X.X.X.X", job="snmp-1"} == 2
        for: 1s
        labels:
          severity: critical
        annotations:
          summary: "100M专线(IOA)线路异常"
          description: "AD-eth5接口-100M专线(IOA)线路连接断开,请及时处理!"

      - alert: "AD拨号-1000M电信"
        expr: ifOperStatus{group="snmp", hostname="AD-2", ifDescr="eth6", ifIndex="23", ifName="eth6", instance="X.X.X.X", job="snmp-1"} == 2
        for: 1s
        labels:
          severity: critical
        annotations:
          summary: "AD拨号-1000M电信线路异常"
          description: "AD-eth6接口-AD拨号-1000M电信线路连接断开,请及时处理!"

      - alert: "LAN"
        expr: ifOperStatus{group="snmp", hostname="AD-2", ifDescr="eth7", ifIndex="24", ifName="eth7", instance="X.X.X.X", job="snmp-1"} == 2
        for: 1s
        labels:
          severity: critical
        annotations:
          summary: "LAN线路异常"
          description: "AD-eth7接口-LAN线路连接断开,请及时处理!"

      - alert: "100M中信备线"
        expr: ifOperStatus{group="snmp", hostname="AD-2", ifDescr="eth9", ifIndex="26", ifName="eth9", instance="X.X.X.X", job="snmp-1"} == 2
        for: 1s
        labels:
          severity: critical
        annotations:
          summary: "100M中信备线"
          description: "AD-eth9接口-100M中信备线线路连接断开,请及时处理!"

      - alert: "AD拨号-1000M联通"
        expr: ifOperStatus{group="snmp", hostname="AD-2", ifDescr="eth10", ifIndex="27", ifName="eth10", instance="X.X.X.X", job="snmp-1"} == 2
        for: 1s
        labels:
          severity: critical
        annotations:
          summary: "AD拨号-1000M联通"
          description: "AD-eth10接口-AD拨号-1000M联通线路连接断开,请及时处理!"

      - alert: "200M新加坡"
        expr: ifOperStatus{group="snmp", hostname="AD-2", ifDescr="eth11", ifIndex="28", ifName="eth11", instance="X.X.X.X", job="snmp-1"} == 2
        for: 1s
        labels:
          severity: critical
        annotations:
          summary: "200M新加坡"
          description: "AD-eth11接口-200M新加坡线路连接断开,请及时处理!"

      - alert: "1000M电信-路由2"
        expr: ifOperStatus{group="snmp", hostname="AD-2", ifDescr="eth12", ifIndex="29", ifName="eth12", instance="X.X.X.X", job="snmp-1"} == 2
        for: 1s
        labels:
          severity: critical
        annotations:
          summary: "1000M电信-路由2"
          # Interface corrected to eth12 to match the expression.
          description: "AD-eth12接口-1000M电信-路由2线路连接断开,请及时处理!"
--------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Bandwidth alerts for the interfaces of device "AD-2" (job snmp-1).
# Convention used throughout: ifHCOutOctets = upstream (上行), ifHCInOctets =
# downstream (下行); octet rates are multiplied by 8 and compared in bit/s.
# Alert names, summaries and descriptions were aligned with the direction and
# line each expression actually measures.
groups:
  - name: 本地网络线路流量告警
    rules:
      - alert: "1000M电信-AD拨号上行线路"
        expr: (irate(ifHCOutOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth6"}[5m])) * 8 > 80e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到上行流量过高"
          description: "1000M电信-AD拨号线路上行流量超过线路总流量的80%,请及时处理!"

      - alert: "1000M电信-AD拨号下行线路"
        expr: (irate(ifHCInOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth6"}[5m])) * 8 > 800e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到下行流量过高"
          description: "1000M电信-AD拨号线路下行流量超过线路总流量的80%,请及时处理!"

      # Was named "...下行线路", duplicating the next rule's alert name.
      - alert: "1000M联通-AD拨号上行线路"
        expr: (irate(ifHCOutOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth10"}[5m])) * 8 > 80e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到上行流量过高"
          description: "1000M联通-AD拨号线路上行流量超过线路总流量的80%,请及时处理!"

      - alert: "1000M联通-AD拨号下行线路"
        expr: (irate(ifHCInOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth10"}[5m])) * 8 > 800e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到下行流量过高"
          description: "1000M联通-AD拨号线路下行流量超过线路总流量的80%,请及时处理!"

      - alert: "1000M腾讯云上行线路"
        expr: (irate(ifHCOutOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth3"}[5m])) * 8 > 800e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到上行流量过高"
          description: "1000M腾讯云线路上行流量超过线路总流量的80%,请及时处理!"

      - alert: "1000M腾讯云下行线路"
        expr: (irate(ifHCInOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth3"}[5m])) * 8 > 800e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到下行流量过高"
          description: "1000M腾讯云线路下行流量超过线路总流量的80%,请及时处理!"

      - alert: "1000M电信-路由上行线路"
        expr: (irate(ifHCOutOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth12"}[5m])) * 8 > 800e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到上行流量过高"
          description: "1000M电信-路由线路上行流量超过线路总流量的80%,请及时处理!"

      - alert: "1000M电信-路由下行线路"
        expr: (irate(ifHCInOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth12"}[5m])) * 8 > 800e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到下行流量过高"
          description: "1000M电信-路由线路下行流量超过线路总流量的80%,请及时处理!"

      - alert: "100M-IOA专线上行线路"
        expr: (irate(ifHCOutOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth5"}[5m])) * 8 > 80e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到上行流量过高"
          description: "100M-IOA线路上行流量超过线路总流量的80%,请及时处理!"

      - alert: "100M-IOA专线下行线路"
        expr: (irate(ifHCInOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth5"}[5m])) * 8 > 80e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到下行流量过高"
          description: "100M-IOA线路下行流量超过线路总流量的80%,请及时处理!"

      - alert: "100M电信CN2上行线路"
        expr: (irate(ifHCOutOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth1"}[5m])) * 8 > 80e6
        for: 15s
        labels:
          severity: warning
        annotations:
          summary: "检测到上行流量过高"
          description: "100M电信CN2线路上行流量超过线路总流量的80%,请及时处理!"

      - alert: "100M电信CN2下行线路"
        expr: (irate(ifHCInOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth1"}[5m])) * 8 > 80e6
        for: 15s
        labels:
          severity: warning
        annotations:
          summary: "检测到下行流量过高"
          description: "100M电信CN2线路下行流量超过线路总流量的80%,请及时处理!"

      - alert: "100M中信备线上行线路"
        expr: (irate(ifHCOutOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth9"}[5m])) * 8 > 80e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到上行流量过高"
          description: "100M中信备线线路上行流量超过线路总流量的80%,请及时处理!"

      - alert: "100M中信备线下行线路"
        expr: (irate(ifHCInOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth9"}[5m])) * 8 > 80e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到下行流量过高"
          description: "100M中信备线线路下行流量超过线路总流量的80%,请及时处理!"

      - alert: "200M新加坡上行线路"
        expr: (irate(ifHCOutOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth11"}[5m])) * 8 > 160e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到上行流量过高"
          description: "200M新加坡线路上行流量超过线路总流量的80%,请及时处理!"

      - alert: "200M新加坡下行线路"
        expr: (irate(ifHCInOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth11"}[5m])) * 8 > 160e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到下行流量过高"
          description: "200M新加坡线路下行流量超过线路总流量的80%,请及时处理!"

      - alert: "200M中信上行线路"
        expr: (irate(ifHCOutOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth4"}[5m])) * 8 > 160e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到上行流量过高"
          description: "200M中信线路上行流量超过线路总流量的80%,请及时处理!"

      - alert: "200M中信下行线路"
        expr: (irate(ifHCInOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth4"}[5m])) * 8 > 160e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到下行流量过高"
          description: "200M中信线路下行流量超过线路总流量的80%,请及时处理!"

      # NOTE(review): 80e6 / 800e6 are not 80% of a 900M line (720e6) -
      # confirm the intended thresholds.
      - alert: "900M联通上行线路"
        expr: (irate(ifHCOutOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth2"}[5m])) * 8 > 80e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到上行流量过高"
          description: "900M联通线路上行流量超过线路总流量的80%,请及时处理!"

      - alert: "900M联通下行线路"
        expr: (irate(ifHCInOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth2"}[5m])) * 8 > 800e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到下行流量过高"
          description: "900M联通线路下行流量超过线路总流量的80%,请及时处理!"

      # Both LAN rules previously read ifHCInOctets, so the 4000e6 rule could
      # never fire before the 500e6 one. The upstream rule now follows the
      # file's Out = upstream convention.
      # NOTE(review): confirm the direction mapping and thresholds for the
      # LAN-side port.
      - alert: "LAN上行线路"
        expr: (irate(ifHCOutOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth7"}[5m])) * 8 > 500e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到上行流量过高"
          description: "LAN线路上行流量超过线路总流量的80%,请及时处理!"

      - alert: "LAN下行线路"
        expr: (irate(ifHCInOctets{job="snmp-1",instance="X.X.X.X",hostname="AD-2",ifName="eth7"}[5m])) * 8 > 4000e6
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "检测到下行流量过高"
          description: "LAN线路下行流量超过线路总流量的80%,请及时处理!"
4、配置alertmanager
1)配置alertmanager配置文件 vim alertmanager.yml
# Alertmanager configuration: SMTP settings, routing by severity, receivers
# (e-mail and WeCom webhook) and inhibition rules.
global:
  resolve_timeout: 5m
  # NOTE(review): port 25 with smtp_require_tls disabled sends the SMTP
  # credentials unencrypted - prefer a TLS-enabled port if the provider has one.
  smtp_smarthost: 'smtp.163.com:25'
  smtp_from: X.X.X.X@163.com
  smtp_auth_username: X.X.X.X@163.com
  smtp_auth_password: X.X.X.X
  smtp_require_tls: false

templates:
  - /etc/alertmanager/mail.tmpl

route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 6h
  receiver: 'email'
  routes:
    # Line-down and host-down alerts. The receiver sends both the webhook and
    # the e-mail, so the former second "critical -> email" route (which could
    # never be reached without "continue: true") was dropped.
    - match:
        severity: 'critical'
      receiver: 'webhook-critical'
    # Service alerts. "match" compares the whole value literally, so matching
    # either P1 or P2 needs a regular expression (was 'P1,P2').
    - match_re:
        severity: 'P1|P2'
      receiver: 'email'
    # Line alerts. The label name was misspelled "secerity", so this route
    # never matched and warnings fell through to the default receiver.
    - match:
        severity: 'warning'
      receiver: 'https-alert'

receivers:
  - name: 'email'
    email_configs:
      - to: '[email protected], [email protected], [email protected]'
        html: '{{ template "email.to.html" . }}'
        send_resolved: true

  - name: 'webhook-critical'
    webhook_configs:
      # The robot key is a credential; keep it out of shared documents.
      # NOTE(review): presumably this endpoint expects its own message format
      # rather than Alertmanager's webhook JSON - verify whether an adapter is
      # needed.
      - url: 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=<YOUR_WEBHOOK_KEY>'
    email_configs:
      - to: '[email protected], [email protected], [email protected]'
        html: '{{ template "email.to.html" . }}'
        send_resolved: true

  - name: 'https-alert'
    email_configs:
      - to: '[email protected], [email protected], [email protected]'
        html: '{{ template "email03.to.html" . }}'
        send_resolved: true

inhibit_rules:
  # A critical alert silences the matching warning alert.
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'cluster', 'service']
  # A warning alert silences the matching info alert. "equal" was missing;
  # without it any warning would inhibit every info alert.
  - source_match:
      severity: 'warning'
    target_match:
      severity: 'info'
    equal: ['alertname', 'cluster', 'service']
2)配置告警模板 vim mail.tmpl (网络线路告警和机器告警不使用同一个告警模板)
{{/* "email.to.html": HTML mail body for host alerts. The firing section loops
     over .Alerts.Firing and the recovery section over .Alerts.Resolved;
     looping over .Alerts in both printed every alert in both sections. */}}
{{ define "email.to.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}{{ range .Alerts.Firing }}
<h2 style="color:red">==========异常告警==========</h2>
告警程序: prometheus_alert <br>
告警级别: {{ .Labels.severity }} 级 <br>
告警类型: {{ .Labels.alertname }} <br>
故障主机: {{ .Labels.instance }} <br>
告警主题: {{ .Annotations.summary }} <br>
告警详情: {{ .Annotations.description }} <br>
触发时间: {{ .StartsAt.Local.Format "2006-01-02 15:04:05" }} <br>
{{ end }}{{ end -}}
{{- if gt (len .Alerts.Resolved) 0 -}}{{ range .Alerts.Resolved }}
<h2 style="color:green">==========异常恢复==========</h2>
告警程序: prometheus_alert <br>
故障主机: {{ .Labels.instance }}<br>
故障主题: {{ .Annotations.summary }}<br>
告警详情: {{ .Annotations.description }}<br>
告警时间: {{ .StartsAt.Local.Format "2006-01-02 15:04:05" }}<br>
{{/* 28800e9 ns = 8 h, added to EndsAt before formatting. NOTE(review): the
     start time uses .Local instead - confirm both render in the same zone. */}}
恢复时间: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
{{ end }}{{ end -}}
{{- end }}
{{/* "email03.to.html": same layout without the alert-type and host fields;
     referenced by the "https-alert" receiver. */}}
{{ define "email03.to.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}{{ range .Alerts.Firing }}
<h2 style="color:red">==========异常告警==========</h2>
告警程序: prometheus_alert <br>
告警级别: {{ .Labels.severity }} 级 <br>
告警主题: {{ .Annotations.summary }} <br>
告警详情: {{ .Annotations.description }} <br>
触发时间: {{ .StartsAt.Local.Format "2006-01-02 15:04:05" }} <br>
{{ end }}{{ end -}}
{{- if gt (len .Alerts.Resolved) 0 -}}{{ range .Alerts.Resolved }}
<h2 style="color:green">==========异常恢复==========</h2>
告警程序: prometheus_alert <br>
告警级别: {{ .Labels.severity }} 级 <br>
故障主题: {{ .Annotations.summary }}<br>
告警详情: {{ .Annotations.description }}<br>
告警时间: {{ .StartsAt.Local.Format "2006-01-02 15:04:05" }}<br>
恢复时间: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
{{ end }}{{ end -}}
{{- end }}
5、配置consul
1)采用分离部署,consul单独写一个docker-compose.yml,集群部署方式
2)vim docker-compose.yml
# Consul cluster: three server agents (consul1 bootstraps and expects 3
# servers, consul2/3 join it) plus one client agent that serves the UI and
# publishes port 8500.
version: '3.7'

services:
  consul1:
    image: hashicorp/consul:latest
    container_name: consul1
    restart: always
    command: agent -server -client=0.0.0.0 -bootstrap-expect=3 -node=consul1
    volumes:
      - ./consul1/data:/consul/data
      - ./consul1/config:/consul/config
    networks:
      - prom

  consul2:
    image: hashicorp/consul:latest
    container_name: consul2
    restart: always
    command: agent -server -client=0.0.0.0 -retry-join=consul1 -node=consul2
    volumes:
      - ./consul2/data:/consul/data
      - ./consul2/config:/consul/config
    networks:
      - prom

  consul3:
    image: hashicorp/consul:latest
    container_name: consul3
    restart: always
    command: agent -server -client=0.0.0.0 -retry-join=consul1 -node=consul3
    volumes:
      - ./consul3/data:/consul/data
      - ./consul3/config:/consul/config
    networks:
      - prom

  # Client agent; the only container that exposes the HTTP API/UI to the host.
  consul4:
    image: hashicorp/consul:latest
    container_name: consul4
    restart: always
    command: agent -client=0.0.0.0 -retry-join=consul1 -ui -node=client1
    ports:
      - "8500:8500"
    volumes:
      - ./consul4/data:/consul/data
      - ./consul4/config:/consul/config
    networks:
      - prom

networks:
  prom:
    driver: bridge
3) 配置自动注册脚本、hosts注册主机信息(注意这两个文件必须放在同一目录下)
[root@prometheus-2 consul]# cat hosts
prometheus X.X.X.X
prometheus-test X.X.X.X
test-nginx X.X.X.X
snipeit X.X.X.X
prometheus-2 X.X.X.X
--------------------------------------------------------------------------------------------
[root@prometheus-2 consul]# cat linux-node.sh
#!/bin/bash
# Registers every "<host_name> <host_addr>" line of the hosts file as a
# "node-exporter" service (port 9100, with an HTTP health check) through the
# Consul agent API.
CONSUL_SERVER="X.X.X.X"
# Resolve the hosts file next to this script so the working directory does
# not matter.
HOSTS_FILE="$(dirname "$0")/hosts"

# "|| [ -n ... ]" also processes a last line that has no trailing newline.
while read -r host_name host_addr || [ -n "$host_name" ]
do
  # Skip blank lines and comment lines.
  case "$host_name" in
    ''|'#'*) continue ;;
  esac
  # A line without an address would register a broken service entry.
  if [ -z "$host_addr" ]; then
    echo "skipping malformed line for host '$host_name'" >&2
    continue
  fi
  payload='{"id": "'"$host_addr"'","name": "node-exporter","address": "'"$host_addr"'","port":9100,"tags": ["linux-node", "instance='"$host_name"'"],"checks": [{"http": "http://'"$host_addr"':9100/","interval": "15s"}]}'
  # -f turns HTTP errors into a non-zero exit so failed registrations are reported.
  curl -fsS -X PUT -d "$payload" "http://$CONSUL_SERVER:8500/v1/agent/service/register" \
    || echo "failed to register $host_name ($host_addr)" >&2
done < "$HOSTS_FILE"
-------------------------------------------------------------------------------------------
6、配置本地实体物理机
1)进入vm_config目录(注意,每台实体机监控都需要配置一个地址账号密码环境变量)
[root@prometheus-2 vm_config]# ls
vm2_config.env vm3_config.env vm_config.env
[root@prometheus-2 vm_config]# cat *
VSPHERE_USER=root
VSPHERE_PASSWORD=x.x.x.x
VSPHERE_HOST=x.x.x.x
VSPHERE_IGNORE_SSL=TRUE
VSPHERE_SPECS_SIZE=2000
----------------------------------------------------------------------------------------------------------
VSPHERE_USER=root
VSPHERE_PASSWORD=x.x.x.x
VSPHERE_HOST=x.x.x.x
VSPHERE_IGNORE_SSL=TRUE
VSPHERE_SPECS_SIZE=2000
----------------------------------------------------------------------------------------------------------
VSPHERE_USER=root
VSPHERE_PASSWORD=x.x.x.x
VSPHERE_HOST=x.x.x.x
VSPHERE_IGNORE_SSL=TRUE
VSPHERE_SPECS_SIZE=2000
----------------------------------------------------------------------------------------------------------
7、配置blackbox_exporter
1)拉取github项目 git clone https://github.com/prometheus/blackbox_exporter.git
2)修改blackbox.yml配置文件 vim blackbox.yml
# blackbox_exporter probe modules. Only http_2xx is referenced by the
# Prometheus "blackbox" job in this guide.
modules:
  http_2xx:
    prober: http
    http:
      preferred_ip_protocol: "ip4"
  http_post_2xx:
    prober: http
    http:
      method: POST
  tcp_connect:
    prober: tcp
  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^+OK"
      tls: true
      tls_config:
        insecure_skip_verify: false
  grpc:
    prober: grpc
    grpc:
      tls: true
      preferred_ip_protocol: "ip4"
  grpc_plain:
    prober: grpc
    grpc:
      tls: false
      service: "service1"
  ssh_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^SSH-2.0-"
        - send: "SSH-2.0-blackbox-ssh-check"
  irc_banner:
    prober: tcp
    tcp:
      query_response:
        - send: "NICK prober"
        - send: "USER prober prober prober :prober"
        # expect + send in one step: answer the server's PING with the
        # captured token.
        - expect: "PING :([^ ]+)"
          send: "PONG ${1}"
        # The "^" characters were lost from this pattern (it read
        # ":[ ]+ 001"); restored to the form in the exporter's example
        # configuration.
        - expect: "^:[^ ]+ 001"
  icmp:
    prober: icmp
  icmp_ttl5:
    prober: icmp
    timeout: 5s
    icmp:
      ttl: 5
8、配置snmp_exporter
1)拉取github项目 git clone https://github.com/prometheus/snmp_exporter.git
2) 确保系统有GO环境、使用生成器生成配置信息
cd snmp_exporter/generator
make generator mibs
make generate
3)在mibs文件夹中添加被监控机器的mib文件信息
4)配置generator.yml文件
# snmp_exporter generator input for the "sangfor" module: the objects to walk,
# SNMP connection settings, index lookups and type overrides.
modules:
  sangfor:
    walk:
      - sfSysDevName          # system host name
      - adStandByState        # HA active/standby state
      - sfCpuLoadLast1Min     # CPU load average, last 1 minute
      - sfCpuLoadLast5Min     # CPU load average, last 5 minutes
      - sfCpuLoadLast15Min    # CPU load average, last 15 minutes
      - sfCpuTemp             # CPU temperature
      - sfSysTotalMemory      # total memory (KB)
      - sfSysFreeMemory       # available memory (KB)
      - sfDiskSize            # disk size (M)
      - sfFilesystemName      # disk partition name
      - sfDiskUsed            # used disk space (M)
      - sfDiskAvail           # free disk space (M)
      - sfDiskUsedPercent     # disk usage (%)
      - sfDeviceStatus        # disk status
      - sfFanName             # fan name
      - sfFanSpeed            # fan speed
      - sfFanState            # fan state
      - sfPowerState          # power supply state
      - adConns               # concurrent system connections
      - adNewConns            # new system connections
      - adVsConns             # concurrent connections, all virtual services
      - adVsNewConns          # new connections, all virtual services
      - adUplinkThroughput    # upstream traffic of all links (integer)
      - adDownlinkThroughput  # downstream traffic of all links (integer)
      - adMemCostRate         # memory usage
      - adHttpRequest         # current HTTP request rate of the device
      - adVsNumber            # number of virtual services
      - adPoolNumber          # number of node pools
      - adNodeNumber          # number of nodes
      - adLinkName            # link name
      - adLinkType            # link type
      - adLinkIfName          # interface used by the link
      - adLinkStatus          # link status: 0 offline, 1 normal, 2 busy
      - adLinkBitIn           # link upstream traffic
      - adLinkBitOut          # link downstream traffic
      - adLinkNumber          # number of links on the device
      - adCpuCostRate         # CPU usage
      - adUptime              # system uptime
      - adInterfaceName       # interface name
      - adInterfaceBitIn      # interface upstream data
    max_repetitions: 25
    retries: 3
    timeout: 5s
    version: 2  # SNMP v2
    auth:
      community: public  # device community string
    # Each lookup is its own list entry; they had been fused onto shared lines.
    lookups:
      - source_indexes: [LinkIndex]
        lookup: adLinkType
      - source_indexes: [LinkIndex]
        lookup: adLinkIfName
      - source_indexes: [LinkIndex]
        lookup: adLinkName
    overrides:
      sfSysDevName:
        type: DisplayString
      sfSysCpuCostRate:
        type: DisplayString
      sfCpuLoadLast1Min:
        type: DisplayString
      sfCpuTemp:
        type: DisplayString
      adStandByState:
        type: DisplayString
      adLinkName:
        type: DisplayString
5)使用配置器生成snmp.yml配置文件 (用docker生成)
docker run -it -v "${PWD}:/opt/" prom/snmp-generator:master generate
9、配置nginx代理
1)进入nginx文件夹修改配置信息
目录结构[root@prometheus-2 nginx]# tree
├── conf.d
│ └── default.conf #nginx.config配置文件
├── html
│ └── index.html #访问展示页面
└── image #存放image图像
├── alertmanager.png
├── consul.png
├── grafana.jpg
├── x.x.x.x.png
└── prometheus.png
vim nginx/conf.d/default.conf
# Landing-page virtual host: serves the static index page and the logo images
# mounted at /image. Restored to one directive per line - on a single line,
# everything after the first "#" is a comment - and the closing brace of the
# server block was added.
server {
    listen       80;
    listen  [::]:80;
    server_name  localhost;

    #access_log  /var/log/nginx/host.access.log  main;

    location / {
        root   /usr/share/nginx/html;
        index  index.html index.htm;
    }

    #error_page  404              /404.html;

    # redirect server error pages to the static page /50x.html
    #
    error_page   500 502 503 504  /50x.html;
    location = /50x.html {
        root   /usr/share/nginx/html;
    }

    # Image storage: lets nginx serve the pictures referenced by index.html.
    location /image/ {
        alias /image/;
    }

    # proxy the PHP scripts to Apache listening on 127.0.0.1:80
    #
    #location ~ \.php$ {
    #    proxy_pass   http://127.0.0.1;
    #}

    # pass the PHP scripts to FastCGI server listening on 127.0.0.1:9000
    #
    #location ~ \.php$ {
    #    root           html;
    #    fastcgi_pass   127.0.0.1:9000;
    #    fastcgi_index  index.php;
    #    fastcgi_param  SCRIPT_FILENAME  /scripts$fastcgi_script_name;
    #    include        fastcgi_params;
    #}

    # deny access to .htaccess files, if Apache's document root
    # concurs with nginx's one
    #
    #location ~ /\.ht {
    #    deny  all;
    #}
}
2)配置访问展示页面
[root@prometheus-2 html]# cat index.html
<!DOCTYPE html>
<!-- Landing page: one card per monitoring tool, each linking to its web UI. -->
<html lang="en">
<head>
  <meta charset="utf-8">
  <title>Monitoring</title>
  <style>
    .app-wrapper {
      display: flex;
      justify-content: center;
      align-items: center;
    }
    .app-container {
      width: 250px;
      height: 300px;
      margin: 10px;
      padding: 10px;
      border: 1px solid #ccc;
      border-radius: 5px;
      text-align: center;
    }
    .app-container img {
      width: 150px;
      height: 150px;
      object-fit: contain;
      margin-bottom: 10px;
    }
    .app-description {
      margin-bottom: 10px;
    }
    .app-button-container {
      display: flex;
      justify-content: center;
    }
    .app-button {
      display: inline-block;
      padding: 8px 16px;
      background-color: #4CAF50;
      color: white;
      text-decoration: none;
      border-radius: 4px;
    }
  </style>
</head>
<body>
  <div class="app-wrapper">
    <div class="app-container">
      <a href="http://X.X.X.X:9090" target="_blank">
        <img src="/image/prometheus.png" alt="prometheus">
      </a>
      <div class="app-description">Prometheus Monitoring</div>
      <div class="app-button-container">
        <a href="http://X.X.X.X:9090" class="app-button" target="_blank">Click to Enter</a>
      </div>
    </div>
    <div class="app-container">
      <a href="http://X.X.X.X:3000" target="_blank">
        <img src="/image/grafana.jpg" alt="grafana">
      </a>
      <div class="app-description">Grafana Dashboard</div>
      <div class="app-button-container">
        <a href="http://X.X.X.X:3000" class="app-button" target="_blank">Click to Enter</a>
      </div>
    </div>
    <div class="app-container">
      <a href="http://X.X.X.X:9093" target="_blank">
        <img src="/image/alertmanager.png" alt="alertmanager">
      </a>
      <div class="app-description">Alertmanager Notifications</div>
      <div class="app-button-container">
        <a href="http://X.X.X.X:9093" class="app-button" target="_blank">Click to Enter</a>
      </div>
    </div>
    <div class="app-container">
      <a href="http://X.X.X.X:8500" target="_blank">
        <img src="/image/consul.png" alt="consul">
      </a>
      <div class="app-description">Consul Service Discovery</div>
      <div class="app-button-container">
        <a href="http://X.X.X.X:8500" class="app-button" target="_blank">Click to Enter</a>
      </div>
    </div>
    <div class="app-container">
      <a href="https://x.x.x.x:3000" target="_blank">
        <img src="/image/IOA.png" alt="IOA grafana">
      </a>
      <div class="app-description">IOA-Grafana</div>
      <div class="app-button-container">
        <a href="https://x.x.x.x:3000" class="app-button" target="_blank">Click to Enter</a>
      </div>
    </div>
  </div>
</body>
</html>
10、可用性验证
1)访问nginx代理域名
2)prometheus展示
3)grafana展示
4)alertmanager展示
5)consul展示