Zabbix对接通义千问大模型,主要有两个场景:
- 使用大模型对告警信息进行实时分析
- 将告警分析结果发送到第三方平台(钉钉/微信)
备注:本文对接的大模型为通义千问大模型,大模型部署在本地机房GPU服务器上,无token和调用次数限制。
1、准备脚本
[root@zabbixweb alertscripts]# pwd
/usr/lib/zabbix/alertscripts
[root@zabbixweb alertscripts]# cat zabbix_qwen.sh
#!/bin/bash
# Send one Zabbix alert text to the chat-completions endpoint and print the
# model's analysis, followed by the total request time reported by curl.
#
# Usage:  zabbix_qwen.sh "<alert text>"
# Exit:   1 when the argument is missing, jq is not installed, the temporary
#         file cannot be created, or the HTTP status is not 200; 0 otherwise.

# Require the alert text as the first argument.
if [ $# -eq 0 ]; then
    echo "请提供内容参数!"
    exit 1
fi

# jq both builds the request and parses the reply, so check for it up front.
if ! command -v jq &> /dev/null; then
    echo "jq 工具未安装,请安装后重试。"
    exit 1
fi

# Prompt = fixed instruction, a newline and a space, then the alert text.
user_content=$1
final_content="请分析以下Zabbix告警信息,并提供可能的原因和解决建议:"$'\n'" ${user_content}"

# API URL
api_url="http://192.168.21.200:8000/v1/chat/completions"

# Let jq serialise the request so that quotes, backslashes and newlines in the
# alert text are escaped correctly instead of producing invalid JSON.
json_request=$(jq -n --arg model "LLM72B" --arg content "$final_content" \
    '{model: $model, messages: [{role: "user", content: $content}]}')

# Use a private temporary file (concurrent alerts would otherwise overwrite a
# shared response.json) and remove it on every exit path.
response_file=$(mktemp) || { echo "无法创建临时文件"; exit 1; }
trap 'rm -f "$response_file"' EXIT

# POST the request; the body goes to the temp file while curl's -w format
# prints "<http_code> <time_total>" on stdout.
response=$(curl -s -w "%{http_code} %{time_total}" -o "$response_file" -X POST "${api_url}" \
    --header 'Content-Type: application/json' \
    --data "${json_request}")

# Split the status code and the elapsed time out of curl's summary line.
http_code=$(echo "$response" | awk '{print $1}')
response_time=$(echo "$response" | awk '{print $2}')

# Anything other than 200 is a failure: show the status and whatever body
# was received. String comparison avoids a test(1) error on an empty value.
if [ "$http_code" != "200" ]; then
    echo "API 请求失败,状态码: $http_code"
    cat "$response_file"
    exit 1
fi

# Keep the raw body for the error message below.
response_content=$(<"$response_file")

# Extract the reply text. "// empty" makes a missing/null field produce no
# output (plain -r would print the word "null" and look like a valid answer).
result=$(jq -r '.choices[0].message.content // empty' "$response_file" 2>/dev/null)

if [ -n "$result" ]; then
    # printf prints the reply verbatim; echo -e would re-interpret any
    # backslash sequences contained in the model's text.
    printf '%s\n\n' "$result"
else
    echo "请求失败,响应内容: $response_content"
fi

# Report the request duration measured by curl.
echo "接口响应时间: ${response_time}s"
2、测试脚本
3、创建前端脚本
4、前端演示
5、告警分析对接钉钉
- 准备告警媒介脚本
[root@zabbixweb alertscripts]# cat zabbix_qwen_dingding.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
import json
import sys
import os
import time
from datetime import datetime, timedelta
from collections import deque
# --- Endpoints -------------------------------------------------------------
# DingTalk robot webhook (placeholder value; must be replaced before use).
dingding_api_url = "替换为钉钉webhook地址"
# Chat-completions endpoint of the Qwen model.
qianwen_api_url = "http://192.168.21.200:8000/v1/chat/completions"
# Headers sent with the outgoing JSON requests.
headers = {"Content-Type": "application/json;charset=utf-8"}

# --- Logging ---------------------------------------------------------------
log_file = "/var/log/dingding.log"
# Timestamp string captured once, when the module is loaded.
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# --- Rate limiting state ---------------------------------------------------
# Send timestamps per window; each deque is capped at that window's limit.
minute_queue = deque(maxlen=4)      # sends tracked for the per-minute limit
hourly_queue = deque(maxlen=20)     # sends tracked for the per-hour limit
message_queue = deque(maxlen=100)   # sends tracked for the per-day limit
last_send_time = None               # datetime of the last successful send
def log(info):
    """Append one timestamped line to the file named by ``log_file``.

    The timestamp is taken at call time, so lines written after the script
    has been waiting (for example in the rate limiter) show when the event
    actually happened rather than when the module was loaded.

    Args:
        info: Text to record; it is written after a
            ``YYYY-mm-dd HH:MM:SS: `` prefix and followed by a newline.

    Raises:
        OSError: If the log file cannot be opened or written.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # Append mode creates the file when it does not exist, so no separate
    # existence check is needed. UTF-8 is set explicitly because the log
    # messages contain Chinese text and the locale default may not encode it.
    with open(log_file, 'a', encoding='utf-8') as f:
        f.write(f"{stamp}: {info}\n")
def can_send_message():
    """Report whether a message may be sent without exceeding a rate limit.

    Expired timestamps are first removed from the module-level deques
    ``message_queue`` (1 day), ``hourly_queue`` (1 hour) and ``minute_queue``
    (1 minute). The remaining counts are then checked against 100/day,
    20/hour and 4/minute, plus a minimum gap of one second since
    ``last_send_time``. Each refusal is written to the log.

    The state lives in module-level variables, so the limits only cover
    messages sent by the current process.

    Returns:
        bool: True if sending is allowed now, False if any limit is hit.
    """
    now = datetime.now()

    # Drop timestamps that have aged out of their window. Elapsed time is
    # measured with whole timedeltas / total_seconds(): timedelta.seconds is
    # only the seconds *component* (0..86399) and wraps for ages over a day.
    while message_queue and now - message_queue[0] >= timedelta(days=1):
        message_queue.popleft()
    while hourly_queue and (now - hourly_queue[0]).total_seconds() >= 3600:
        hourly_queue.popleft()
    while minute_queue and (now - minute_queue[0]).total_seconds() >= 60:
        minute_queue.popleft()

    # Check the limits from the widest window to the narrowest.
    if len(message_queue) >= 100:
        log("达到每日消息限制(100条/天)")
        return False
    if len(hourly_queue) >= 20:
        log("达到每小时消息限制(20条/小时)")
        return False
    if len(minute_queue) >= 4:
        log("达到每分钟消息限制(4条/分钟)")
        return False
    if last_send_time is not None and (now - last_send_time).total_seconds() < 1:
        log("达到每秒消息限制(1条/秒)")
        return False
    return True
def wait_for_next_slot():
    """Block until ``can_send_message()`` allows a send.

    The check is repeated once per second; the function returns as soon as
    a check succeeds and sleeps one second after every refusal.
    """
    while True:
        if can_send_message():
            return
        time.sleep(1)
def analyze_with_qianwen(subject, message):
    """Ask the Qwen endpoint to analyse one alert and return its answer.

    A single user message containing the alert subject and details is posted
    to ``qianwen_api_url``. This function never raises for a failed or
    malformed reply: the problem is logged and a fixed fallback sentence is
    returned, so the alert can still be delivered without an analysis.

    Args:
        subject: Alert subject line.
        message: Alert detail text.

    Returns:
        str: The content of the first choice in the reply, or the fallback
        text when the request fails or the reply has an unexpected shape.
    """
    fallback = "通义千问分析失败,请人工检查告警信息。"
    json_request = {
        "model": "LLM72B",
        "messages": [
            {
                "role": "user",
                "content": f"请分析以下Zabbix告警信息,并提供可能的原因和解决建议:\n主题: {subject}\n详细信息: {message}"
            }
        ]
    }
    try:
        # requests waits indefinitely without a timeout. Bound the connect
        # phase tightly and leave a generous read window for generation.
        response = requests.post(
            qianwen_api_url,
            json=json_request,
            headers=headers,
            timeout=(10, 300),
        )
        response.raise_for_status()
        return response.json()['choices'][0]['message']['content']
    except requests.exceptions.RequestException as e:
        log(f"通义千问API调用失败: {str(e)}")
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # Non-JSON body, or JSON without choices[0].message.content.
        log(f"通义千问API响应格式异常: {e!r}")
    return fallback
def send_dingding_msg(subject, message, user):
    """Send one alert, with the model's analysis, to the DingTalk webhook.

    Steps: wait until the rate limiter allows a send, obtain the analysis
    from ``analyze_with_qianwen``, then post a markdown message to
    ``dingding_api_url``. On success the send time is recorded in the
    rate-limit deques and in ``last_send_time``.

    Args:
        subject: Alert subject line.
        message: Alert detail text.
        user: Mobile number to @-mention; a falsy value mentions nobody.

    Raises:
        requests.exceptions.RequestException: On a transport error or a
            non-2xx HTTP status (logged, then re-raised).
        Exception: When the webhook reply carries an ``errcode`` other
            than 0, or no ``errcode`` at all.
    """
    global last_send_time
    wait_for_next_slot()
    analysis = analyze_with_qianwen(subject, message)
    json_text = {
        "msgtype": "markdown",
        "markdown": {
            "title": "Zabbix监控告警",
            "text": f"### 告警主题\n{subject}\n\n### 告警详情\n{message}\n\n### 通义千问分析\n{analysis}"
        },
        "at": {
            "atMobiles": [user] if user else [],
            "isAtAll": False
        }
    }
    try:
        # Without a timeout requests can block forever on a stalled
        # connection, which would hang the alert script.
        response = requests.post(
            dingding_api_url,
            data=json.dumps(json_text),
            headers=headers,
            timeout=10,
        )
        response.raise_for_status()
        result = response.json()
        # .get() sends a reply without "errcode" down the logged failure
        # path instead of raising a bare KeyError.
        errcode = result.get("errcode")
        if errcode == 0:
            log("消息发送成功")
            # Record this send in every rate-limit window.
            now = datetime.now()
            message_queue.append(now)
            hourly_queue.append(now)
            minute_queue.append(now)
            last_send_time = now
        else:
            error_message = result.get("errmsg", "未知错误")
            log(f"消息发送失败,错误码: {errcode}, 错误信息: {error_message}")
            if errcode == 300005:
                # Extra troubleshooting hints logged for this error code.
                log("可能原因: 机器人被禁用。请检查以下几点:")
                log("1. 钉钉机器人的安全设置是否正确配置")
                log("2. 是否触发了消息发送频率限制")
                log("3. 如果设置了IP白名单,请确保Zabbix服务器的IP在白名单中")
                log("4. 检查访问令牌是否过期或被重置")
            raise Exception(f"钉钉API返回错误: {error_message}")
    except requests.exceptions.RequestException as e:
        log(f"钉钉API调用失败: {str(e)}")
        raise
if __name__ == '__main__':
    # Entry point: exactly three arguments are expected after the script
    # name, in the order message, subject, recipient.
    expected_argc = 4
    if len(sys.argv) != expected_argc:
        log(f"参数不足: 收到 {len(sys.argv) - 1} 个参数,期望 3 个")
        print(f"Usage: {sys.argv[0]} <ALERT.MESSAGE> <ALERT.SUBJECT> <ALERT.SENDTO>")
        sys.exit(1)

    message, subject, user = sys.argv[1:4]

    # Any failure while sending is logged and reported via exit status 1.
    try:
        send_dingding_msg(subject, message, user)
    except Exception as e:
        log(f"脚本执行失败: {str(e)}")
        sys.exit(1)