Bootstrap

docker-compose安装prometheus告警系统

一、概述

本文只有监控与告警的安装、告警发送、发送模版的配置。没有数据展示监控数据UI工具

一、docker-compose

1)docker-compose.yaml

version: '3.0'
services:
  #1.prometheus
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    restart: always
    volumes:
      - $PWD/config/prometheus.yml:/etc/prometheus/prometheus.yml
      - $PWD/config/rules:/etc/prometheus/rules
      - $PWD/config/targets:/etc/prometheus/targets
    command:
      - --web.enable-lifecycle
      - --config.file=/etc/prometheus/prometheus.yml
    ports:
      - 9090:9090
    networks:
      - prometheus
    environment:
      - TZ=Asia/Shanghai

  #2.alertmanager
  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    restart: always
    volumes:
      - $PWD/config/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - $PWD/config/template:/alertmanager/template
    ports:
      - 9093:9093
    networks:
      - prometheus
    environment:
      - TZ=Asia/Shanghai

networks:
  prometheus:
    driver: bridge

启动后prometheus访问地址:http://192.168.1.1:9090/graph?g0.expr=&g0.tab=1&g0.stacked=0&g0.range_input=1h
在这里插入图片描述
alertmanager访问地址:http://192.168.1.1:9093/#/alerts
在这里插入图片描述

二、配置文件

1)prometheus.yml

global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
  
# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*"
# - "second_rules.yml"

# remote_write:
#   - url: http://192.168.50.200:8080/prometheus

scrape_configs:
  - job_name: prometheus
    static_configs:
      - targets: ['192.168.1.1:9090']
        labels:
          instance: prometheus

  - job_name: 'node_exporter'
    file_sd_configs:
    - files:
      - targets/*.yml
    

2)alertmanager.yml


global:
  # smtp_smarthost: 'smtp.exmail.qq.com:587'
  # smtp_from: '***'                # 谁发邮件
  # smtp_auth_username: '***'      # 邮箱用户
  # smtp_auth_password: '***'                 # 邮箱密码
  # smtp_require_tls: true
  smtp_smarthost: 'smtp.163.com:465'
  smtp_from: '***'               # 谁发邮件
  smtp_auth_username: '***'       # 邮箱用户
  smtp_auth_password: '***'                   # 邮箱密码
  smtp_require_tls: false

route:
  group_by: ["instance"]            # 分组名
  group_wait: 10s                   # 当收到告警的时候,等待三十秒看是否还有告警,如果有就一起发出去
  group_interval: 10s                # 发送警告间隔时间
  repeat_interval: 1h              # 重复报警的间隔时间
  receiver: mail                    # 全局报警组,这个参数是必选的,和下面报警组名要相同

templates:
  - '/alertmanager/template/*.tmpl'
  
receivers:
- name: 'mail'                      # 报警组名
  email_configs:
  - to: '****'      # 发送给谁,多个逗号隔开
    html: '{{ template "alert.html" . }}'
    headers: { Subject: "[WARN] 监控报警" }
 

3)targets配置,建目标服务器配置文件node_targets.yml

- targets:
  - 192.168.1.*:9100
  labels:
    host: aaa
- targets:
  - 192.168.1.*:9100
  labels:
    host: bbb

4)常用告警规则,以node_exporter 为例node_down.yml

groups:
- name: 实例存活告警规则
  rules:
  - alert: 实例存活告警
    expr: up == 0
    for: 1m
    labels:
      user: prometheus
      severity: error
    annotations:
      summary: "机器 {{ $labels.instance }} 挂了"
      description: "{{ $labels.instance }} 已停止超过1分钟"

- name: CPU报警规则
  rules:
  - alert: CPU使用率告警
    expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 90
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      description: "服务器: CPU使用超过90%!(当前值: {{ $value }}%)"
      summary: "机器 {{ $labels.instance }} CPU使用超过90%"

- name: 内存报警规则
  rules:
  - alert: 内存使用率告警
    expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 80
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      description: "服务器: 内存使用超过80%!(当前值: {{ $value }}%)"
      summary: "机器 {{ $labels.instance }} 内存使用超过80%"

- name: 磁盘报警规则
  rules:
  - alert: 磁盘使用率告警
    expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 80
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      description: "服务器: 磁盘设备: 使用超过80%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"
      summary: "机器 {{ $labels.instance }} 磁盘使用超过80%"

5)告警模版 alert.tmpl

{{ define "alert.html" }}
{{ range .Alerts }}
<table border="1">
  <tr>
    <th> 告警通知 </th>
    <th>  厂区prometheus监控告警通知 </th>
  </tr>
  <tr>
    <td> 告警级别 </td>
    <td>{{ .Labels.severity }}</td>
  </tr>

  <tr>
    <td> 告警类型 </td>
    <td>{{ .Labels.alertname }}</td>
  </tr>
  <tr>
    <td> 故障主机 </td>
    <td>{{ .Labels.instance }}</td>
  </tr>

  <tr>
    <td> 告警主题 </td>
    <td>{{ .Annotations.summary }}</td>
  </tr>

  <tr>
    <td> 告警详情 </td>
    <td>{{ .Annotations.description }}</td>
  </tr>

  <tr>
    <td> 触发时间 </td>
    <td>{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}</td>
  </tr>

</table>
{{ end }}
{{ end }}
;