docker-compose安装Prometheus
一、概述
本文只有监控与告警的安装、告警发送、发送模版的配置。没有数据展示监控数据UI工具
一、docker-compose
1)docker-compose.yaml
version: '3.0'
services:
#1.prometheus
prometheus:
image: prom/prometheus
container_name: prometheus
restart: always
volumes:
- $PWD/config/prometheus.yml:/etc/prometheus/prometheus.yml
- $PWD/config/rules:/etc/prometheus/rules
- $PWD/config/targets:/etc/prometheus/targets
command:
- --web.enable-lifecycle
- --config.file=/etc/prometheus/prometheus.yml
ports:
- 9090:9090
networks:
- prometheus
environment:
- TZ=Asia/Shanghai
#2.alertmanager
alertmanager:
image: prom/alertmanager
container_name: alertmanager
restart: always
volumes:
- $PWD/config/alertmanager.yml:/etc/alertmanager/alertmanager.yml
- $PWD/config/template:/alertmanager/template
ports:
- 9093:9093
networks:
- prometheus
environment:
- TZ=Asia/Shanghai
networks:
prometheus:
driver: bridge
启动后prometheus访问地址:http://192.168.1.1:9090/graph?g0.expr=&g0.tab=1&g0.stacked=0&g0.range_input=1h
alertmanager访问地址:http://192.168.1.1:9093/#/alerts
二、配置文件
1)prometheus.yml
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "rules/*"
# - "second_rules.yml"
# remote_write:
# - url: http://192.168.50.200:8080/prometheus
scrape_configs:
- job_name: prometheus
static_configs:
- targets: ['192.168.1.1:9090']
labels:
instance: prometheus
- job_name: 'node_exporter'
file_sd_configs:
- files:
- targets/*.yml
2)alertmanager.yml
global:
# smtp_smarthost: 'smtp.exmail.qq.com:587'
# smtp_from: '***' # 谁发邮件
# smtp_auth_username: '***' # 邮箱用户
# smtp_auth_password: '***' # 邮箱密码
# smtp_require_tls: true
smtp_smarthost: 'smtp.163.com:465'
smtp_from: '***' # 谁发邮件
smtp_auth_username: '***' # 邮箱用户
smtp_auth_password: '***' # 邮箱密码
smtp_require_tls: false
route:
group_by: ["instance"] # 分组名
group_wait: 10s # 当收到告警的时候,等待三十秒看是否还有告警,如果有就一起发出去
group_interval: 10s # 发送警告间隔时间
repeat_interval: 1h # 重复报警的间隔时间
receiver: mail # 全局报警组,这个参数是必选的,和下面报警组名要相同
templates:
- '/alertmanager/template/*.tmpl'
receivers:
- name: 'mail' # 报警组名
email_configs:
- to: '****' # 发送给谁,多个逗号隔开
html: '{{ template "alert.html" . }}'
headers: { Subject: "[WARN] 监控报警" }
3)targets配置,建目标服务器配置文件node_targets.yml
- targets:
- 192.168.1.*:9100
labels:
host: aaa
- targets:
- 192.168.1.*:9100
labels:
host: bbb
4)常用告警规则,以node_exporter 为例node_down.yml
groups:
- name: 实例存活告警规则
rules:
- alert: 实例存活告警
expr: up == 0
for: 1m
labels:
user: prometheus
severity: error
annotations:
summary: "机器 {{ $labels.instance }} 挂了"
description: "{{ $labels.instance }} 已停止超过1分钟"
- name: CPU报警规则
rules:
- alert: CPU使用率告警
expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 90
for: 1m
labels:
user: prometheus
severity: warning
annotations:
description: "服务器: CPU使用超过90%!(当前值: {{ $value }}%)"
summary: "机器 {{ $labels.instance }} CPU使用超过90%"
- name: 内存报警规则
rules:
- alert: 内存使用率告警
expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 80
for: 1m
labels:
user: prometheus
severity: warning
annotations:
description: "服务器: 内存使用超过80%!(当前值: {{ $value }}%)"
summary: "机器 {{ $labels.instance }} 内存使用超过80%"
- name: 磁盘报警规则
rules:
- alert: 磁盘使用率告警
expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 80
for: 1m
labels:
user: prometheus
severity: warning
annotations:
description: "服务器: 磁盘设备: 使用超过80%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"
summary: "机器 {{ $labels.instance }} 磁盘使用超过80%"
5)告警模版 alert.tmpl
{{ define "alert.html" }}
{{ range .Alerts }}
<table border="1">
<tr>
<th> 告警通知 </th>
<th> 厂区prometheus监控告警通知 </th>
</tr>
<tr>
<td> 告警级别 </td>
<td>{{ .Labels.severity }}</td>
</tr>
<tr>
<td> 告警类型 </td>
<td>{{ .Labels.alertname }}</td>
</tr>
<tr>
<td> 故障主机 </td>
<td>{{ .Labels.instance }}</td>
</tr>
<tr>
<td> 告警主题 </td>
<td>{{ .Annotations.summary }}</td>
</tr>
<tr>
<td> 告警详情 </td>
<td>{{ .Annotations.description }}</td>
</tr>
<tr>
<td> 触发时间 </td>
<td>{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}</td>
</tr>
</table>
{{ end }}
{{ end }}