目标:如果服务器GPU空置,可以及时推送消息到飞书群。
其他类似的监控目标也可以修改代码实现。
步骤:
(1) 首先在群聊设置加入机器人,复制webhook_url
(2) 将代码中的 feishu_webhook_url 替换为上一步复制的地址,然后在服务器后台运行如下命令(其中 my_server 为本机标识,会显示在推送消息中)。
"""
nohup python monitor_gpu.py my_server > monitor_gpu.log 2>&1 &
"""
import sys
import subprocess
import requests
import time
from datetime import datetime
import logging
# Configure root logging at INFO level, then grab this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Feishu bot webhook address; replace the placeholder token with the real one.
feishu_webhook_url = "https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxx"

# The machine identifier is a mandatory first command-line argument; it labels
# this server in the notifications sent to the group.
if not sys.argv[1:]:
    logger.error(f"Usage: python {__file__} <machine_id>")
    sys.exit(1)
machine_id = sys.argv[1]
def _parse_gpu_stats(output):
    """Parse ``nvidia-smi`` CSV rows into per-GPU utilization and memory lists.

    Args:
        output: Text with one ``utilization, memory_used, memory_total`` row
            per GPU.

    Returns:
        A ``(utils, mems)`` pair of integer lists, one entry per parsable row.
        Rows that cannot be parsed are logged and skipped.
    """
    utils = []
    mems = []
    for line in output.splitlines():
        if not line.strip():
            continue
        try:
            # int() tolerates the blank that follows each comma separator.
            gpu_utilization, memory_used, _memory_total = (
                int(field) for field in line.split(',')
            )
        except ValueError:
            # Raised for a non-numeric field (e.g. "[N/A]") or a wrong number
            # of fields (e.g. an error message printed instead of CSV).
            logger.warning("Skipping unparsable nvidia-smi row: %r", line)
            continue
        utils.append(gpu_utilization)
        mems.append(memory_used)
    return utils, mems


def check_gpu_usage(util_threshold=10):
    """Report whether this machine's GPUs are idle on average.

    Runs ``nvidia-smi`` to read per-GPU utilization and used memory, averages
    both across all GPUs, and compares the average utilization with
    ``util_threshold``.

    Args:
        util_threshold: Average utilization (percent) strictly below which
            the machine is considered idle.

    Returns:
        A ``(is_idle, message)`` tuple: ``is_idle`` is True when the average
        utilization is below the threshold, and ``message`` is the text to
        push to the group for that state.

    Raises:
        RuntimeError: If ``nvidia-smi`` produced no parsable GPU rows, which
            would otherwise surface as an opaque ZeroDivisionError.
    """
    # Query utilization and memory as bare CSV numbers (no header, no units).
    result = subprocess.run(
        [
            'nvidia-smi',
            '--query-gpu=utilization.gpu,memory.used,memory.total',
            '--format=csv,noheader,nounits',
        ],
        stdout=subprocess.PIPE,
    )
    output = result.stdout.decode('utf-8', errors='replace').strip()

    utils, mems = _parse_gpu_stats(output)
    if not utils:
        raise RuntimeError(
            f"nvidia-smi (exit code {result.returncode}) returned no parsable "
            f"GPU data: {output!r}"
        )

    gpu_utilization_avg = sum(utils) / len(utils)
    # nvidia-smi reports memory in MiB, so divide by 1024 for the GB figure.
    memory_used_avg_gb = sum(mems) / len(mems) / 1024

    # Idle means the average utilization is below the configured threshold.
    if gpu_utilization_avg < util_threshold:
        return True, f"⚠️ {machine_id} GPU空闲:平均利用率 {gpu_utilization_avg:.1f}% 显存 {memory_used_avg_gb:.2f} GB"
    return False, f"✅ {machine_id} GPUs are in usage."
def send_feishu_message(message, timeout=10):
    """Post a plain-text message to the Feishu group bot webhook.

    Args:
        message: Text to deliver to the group.
        timeout: Seconds to wait for the webhook before giving up. Without a
            timeout a stalled connection would block the monitor forever.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    headers = {"Content-Type": "application/json"}
    data = {
        "timestamp": int(datetime.now().timestamp()),
        "msg_type": "text",
        "content": {"text": message},
    }
    t = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    response = requests.post(
        feishu_webhook_url, json=data, headers=headers, timeout=timeout
    )
    response.raise_for_status()
    result = response.json()
    # A missing or zero "code" is treated as success; anything else is an
    # API-level failure, logged as an error. "msg" is read defensively so a
    # missing field cannot raise while reporting the failure.
    if result.get("code"):
        logger.error(f"{t} 发送\"{message}\"失败:{result.get('msg')}")
    else:
        logger.info(f"{t} 发送\"{message}\"成功")
# Announce startup. This call sits outside the error handling below, so a
# broken webhook makes the script fail fast instead of monitoring silently.
send_feishu_message(f"🎉 {machine_id} GPU空闲状态监控已开启! 初始化状态为:GPU正在被使用中")
state_prev = False  # last state that was reported; starts as "in use"
max_retry = 3
while True:
    try:
        # Check up to max_retry times, one minute apart; the machine counts
        # as idle only if every check reports idle.
        is_idle, message = None, None
        for attempt in range(max_retry):
            is_idle, message = check_gpu_usage()
            if not is_idle:
                break
            if attempt < max_retry - 1:
                # No need to wait after the final confirmation check.
                time.sleep(60)
        if is_idle != state_prev:
            send_feishu_message(message)
            # Commit the new state only after the notification went out, so a
            # failed send is retried on the next cycle.
            state_prev = is_idle
    except Exception:
        # A transient failure (GPU query hiccup, network error) must not
        # permanently stop a long-running monitor; log it and try again.
        logger.exception("GPU monitor cycle failed; retrying on the next cycle")
    time.sleep(10 * 60)  # check every 10 minutes