Bootstrap

40分钟学 Go 语言高并发:服务监控与追踪

服务监控与追踪

一、知识要点总览

模块核心内容技术选型难度
监控指标请求量、响应时间、错误率、资源使用Prometheus + Grafana
链路追踪分布式调用链、性能瓶颈分析Jaeger, OpenTelemetry
日志处理日志收集、分析、存储ELK Stack
告警系统告警规则、通知渠道、告警分级AlertManager

二、详细实现

1. 监控指标采集系统

让我们先看监控指标的实现:

// metrics/collector.go
package metrics

import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
    "sync"
    "time"
)

type Collector struct {
    // HTTP请求相关指标
    requestCounter   *prometheus.CounterVec
    requestDuration  *prometheus.HistogramVec
    requestInFlight  *prometheus.GaugeVec
    
    // 系统资源指标
    cpuUsage        *prometheus.GaugeVec
    memoryUsage     *prometheus.GaugeVec
    goroutineCount  prometheus.Gauge
    
    // 业务指标
    businessCounter  *prometheus.CounterVec
    queueLength     *prometheus.GaugeVec
    
    // 错误指标
    errorCounter    *prometheus.CounterVec
    
    mu sync.RWMutex
}

func NewCollector(namespace string) *Collector {
    return &Collector{
        requestCounter: promauto.NewCounterVec(
            prometheus.CounterOpts{
                Namespace: namespace,
                Name:      "http_requests_total",
                Help:      "Total number of HTTP requests",
            },
            []string{"method", "path", "status"},
        ),
        
        requestDuration: promauto.NewHistogramVec(
            prometheus.HistogramOpts{
                Namespace: namespace,
                Name:      "http_request_duration_seconds",
                Help:      "HTTP request duration in seconds",
                Buckets:   []float64{0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.5, 2.0},
            },
            []string{"method", "path"},
        ),
        
        requestInFlight: promauto.NewGaugeVec(
            prometheus.GaugeOpts{
                Namespace: namespace,
                Name:      "http_requests_in_flight",
                Help:      "Current number of HTTP requests being processed",
            },
            []string{"method"},
        ),
        
        cpuUsage: promauto.NewGaugeVec(
            prometheus.GaugeOpts{
                Namespace: namespace,
                Name:      "cpu_usage_percent",
                Help:      "Current CPU usage percentage",
            },
            []string{"core"},
        ),
        
        memoryUsage: promauto.NewGaugeVec(
            prometheus.GaugeOpts{
                Namespace: namespace,
                Name:      "memory_usage_bytes",
                Help:      "Current memory usage in bytes",
            },
            []string{"type"},
        ),
        
        goroutineCount: promauto.NewGauge(
            prometheus.GaugeOpts{
                Namespace: namespace,
                Name:      "goroutine_count",
                Help:      "Current number of goroutines",
            },
        ),
        
        businessCounter: promauto.NewCounterVec(
            prometheus.CounterOpts{
                Namespace: namespace,
                Name:      "business_operations_total",
                Help:      "Total number of business operations",
            },
            []string{"operation", "status"},
        ),
        
        queueLength: promauto.NewGaugeVec(
            prometheus.GaugeOpts{
                Namespace: namespace,
                Name:      "queue_length",
                Help:      "Current queue length",
            },
            []string{"queue"},
        ),
        
        errorCounter: promauto.NewCounterVec(
            prometheus.CounterOpts{
                Namespace: namespace,
                Name:      "errors_total",
                Help:      "Total number of errors",
            },
            []string{"type"},
        ),
    }
}

// RecordRequest 记录HTTP请求
func (c *Collector) RecordRequest(method, path string, status int, duration time.Duration) {
    c.requestCounter.WithLabelValues(method, path, string(status)).Inc()
    c.requestDuration.WithLabelValues(method, path).Observe(duration.Seconds())
}

// TrackInFlightRequests 跟踪正在处理的请求
func (c *Collector) TrackInFlightRequests(method string, delta int) {
    c.requestInFlight.WithLabelValues(method).Add(float64(delta))
}

// UpdateCPUUsage 更新CPU使用率
func (c *Collector) UpdateCPUUsage(core string, usage float64) {
    c.cpuUsage.WithLabelValues(core).Set(usage)
}

// UpdateMemoryUsage 更新内存使用情况
func (c *Collector) UpdateMemoryUsage(memType string, bytes float64) {
    c.memoryUsage.WithLabelValues(memType).Set(bytes)
}

// UpdateGoroutineCount 更新goroutine数量
func (c *Collector) UpdateGoroutineCount(count int64) {
    c.goroutineCount.Set(float64(count))
}

// RecordBusinessOperation 记录业务操作
func (c *Collector) RecordBusinessOperation(operation, status string) {
    c.businessCounter.WithLabelValues(operation, status).Inc()
}

// UpdateQueueLength 更新队列长度
func (c *Collector) UpdateQueueLength(queue string, length int) {
    c.queueLength.WithLabelValues(queue).Set(float64(length))
}

// RecordError 记录错误
func (c *Collector) RecordError(errorType string) {
    c.errorCounter.WithLabelValues(errorType).Inc()
}

2. 链路追踪实现

接下来实现分布式链路追踪:

// tracing/tracer.go
package tracing

import (
    "context"
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/exporters/jaeger"
    "go.opentelemetry.io/otel/sdk/resource"
    tracesdk "go.opentelemetry.io/otel/sdk/trace"
    semconv "go.opentelemetry.io/otel/semconv/v1.7.0"
    "go.opentelemetry.io/otel/trace"
)

type Tracer struct {
    tracer     trace.Tracer
    provider   *tracesdk.TracerProvider
}

func NewTracer(serviceName, jaegerEndpoint string) (*Tracer, error) {
    // 创建Jaeger导出器
    exporter, err := jaeger.New(jaeger.WithCollectorEndpoint(
        jaeger.WithEndpoint(jaegerEndpoint),
    ))
    if err != nil {
        return nil, err
    }

    // 创建资源
    res, err := resource.New(context.Background(),
        resource.WithAttributes(
            semconv.ServiceNameKey.String(serviceName),
            attribute.String("environment", "production"),
        ),
    )
    if err != nil {
        return nil, err
    }

    // 创建TracerProvider
    provider := tracesdk.NewTracerProvider(
        tracesdk.WithBatcher(exporter),
        tracesdk.WithResource(res),
        tracesdk.WithSampler(tracesdk.AlwaysSample()),
    )

    // 设置全局TracerProvider
    otel.SetTracerProvider(provider)

    return &Tracer{
        tracer:   provider.Tracer(serviceName),
        provider: provider,
    }, nil
}

// StartSpan 开启新的追踪Span
func (t *Tracer) StartSpan(ctx context.Context, name string, opts ...trace.SpanStartOption) (context.Context, trace.Span) {
    return t.tracer.Start(ctx, name, opts...)
}

// Shutdown 关闭追踪器
func (t *Tracer) Shutdown(ctx context.Context) error {
    return t.provider.Shutdown(ctx)
}

// HTTP中间件
func (t *Tracer) HTTPMiddleware(next http.Handler) http.Handler {
    return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        spanCtx, span := t.StartSpan(r.Context(), "http_request",
            trace.WithAttributes(
                attribute.String("http.method", r.Method),
                attribute.String("http.url", r.URL.String()),
            ),
        )
        defer span.End()

        // 包装ResponseWriter以捕获状态码
        ww := NewResponseWriter(w)
        
        // 使用新的上下文调用下一个处理器
        next.ServeHTTP(ww, r.WithContext(spanCtx))
        
        // 记录响应状态码
        span.SetAttributes(attribute.Int("http.status_code", ww.statusCode))
    })
}

// gRPC拦截器
func (t *Tracer) UnaryServerInterceptor() grpc.UnaryServerInterceptor {
    return func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, 
        handler grpc.UnaryHandler) (interface{}, error) {
        
        spanCtx, span := t.StartSpan(ctx, info.FullMethod,
            trace.WithAttributes(
                attribute.String("rpc.system", "grpc"),
                attribute.String("rpc.method", info.FullMethod),
            ),
        )
        defer span.End()

        resp, err := handler(spanCtx, req)
        if err != nil {
            span.RecordError(err)
        }
        return resp, err
    }
}

// ResponseWriter包装器
type ResponseWriter struct {
    http.ResponseWriter
    statusCode int
}

func NewResponseWriter(w http.ResponseWriter) *ResponseWriter {
    return &ResponseWriter{ResponseWriter: w, statusCode: http.StatusOK}
}

func (w *ResponseWriter) WriteHeader(code int) {
    w.statusCode = code
    w.ResponseWriter.WriteHeader(code)
}

3. 日志处理系统

现在来实现日志处理系统:

// logging/logger.go
package logging

import (
    "context"
    "encoding/json"
    "github.com/olivere/elastic/v7"
    "go.uber.org/zap"
    "go.uber.org/zap/zapcore"
    "time"
)

type Logger struct {
    zap     *zap.Logger
    elastic *elastic.Client
}

type LogEntry struct {
    Timestamp   time.Time              `json:"@timestamp"`
    Level       string                 `json:"level"`
    Message     string                 `json:"message"`
    ServiceName string                 `json:"service_name"`
    TraceID     string                 `json:"trace_id"`
    SpanID      string                 `json:"span_id"`
    Fields      map[string]interface{} `json:"fields,omitempty"`
}

func NewLogger(serviceName string, elasticURL string) (*Logger, error) {
    // 配置zap logger
    config := zap.NewProductionConfig()
    config.EncoderConfig.TimeKey = "@timestamp"
    config.EncoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder
    
    zapLogger, err := config.Build(
        zap.AddCallerSkip(1),
        zap.Fields(zap.String("service", serviceName)),
    )
    if err != nil {
        return nil, err
    }

    // 创建Elasticsearch客户端
    client, err := elastic.NewClient(
        elastic.SetURL(elasticURL),
        elastic.SetSniff(false),
    )
    if err != nil {
        return nil, err
    }

    return &Logger{
        zap:     zapLogger,
        elastic: client,
    }, nil
}

func (l *Logger) Info(ctx context.Context, msg string, fields ...zap.Field) {
    // 添加追踪信息
    fields = append(fields, l.extractTraceInfo(ctx)...)
    
    // 本地日志记录
    l.zap.Info(msg, fields...)
    
    // 异步发送到Elasticsearch
    go l.sendToElasticsearch("info", msg, fields)
}

func (l *Logger) Error(ctx context.Context, msg string, fields ...zap.Field) {
    fields = append(fields, l.extractTraceInfo(ctx)...)
    l.zap.Error(msg, fields...)
    go l.sendToElasticsearch("error", msg, fields)
}

func (l *Logger) extractTraceInfo(ctx context.Context) []zap.Field {
    var fields []zap.Field
    if span := trace.SpanFromContext(ctx); span != nil {
        spanCtx := span.SpanContext()
        fields = append(fields,
            zap.String("trace_id", spanCtx.TraceID().String()),
            zap.String("span_id", spanCtx.SpanID().String()),
        )
    }
    return fields
}

func (l *Logger) sendToElasticsearch(level string, msg string, fields []zap.Field) {
    // 构建日志条目
    entry := LogEntry{
        Timestamp:   time.Now(),
        Level:       level,
        Message:     msg,
        ServiceName: "service_name",
        Fields:      make(map[string]interface{}),
    }

    // 提取字段信息
    for _, field := range fields {
        entry.Fields[field.Key] = field.Interface
    }

    // 发送到Elasticsearch
    _, err := l.elastic.Index().
        Index(fmt.Sprintf("logs-%s", time.Now().Format("2006.01.02"))).
        Type("_doc").
        BodyJson(entry).
        Do(context.Background())

    if err != nil {
        l.zap.Error("Failed to send log to elasticsearch",
            zap.Error(err),
            zap.Any("entry", entry),
        )
    }
}

// 日志查询功能
type QueryOptions struct {
    StartTime time.Time
    EndTime   time.Time
    Level     string
    Service   string
    TraceID   string
    Limit     int
    Offset    int
}

func (l *Logger) Query(ctx context.Context, opts QueryOptions) ([]LogEntry, error) {
    query := elastic.NewBoolQuery()
    
    // 添加时间范围
    if !opts.StartTime.IsZero() {
        query = query.Must(elastic.NewRangeQuery("@timestamp").Gte(opts.StartTime))
    }
    if !opts.EndTime.IsZero() {
        query = query.Must(elastic.NewRangeQuery("@timestamp").Lte(opts.EndTime))
    }
    
    // 添加日志级别过滤
    if opts.Level != "" {
        query = query.Must(elastic.NewTermQuery("level", opts.Level))
    }
    
    // 添加服务名过滤
    if opts.Service != "" {
        query = query.Must(elastic.NewTermQuery("service_name", opts.Service))
    }
    
    // 添加TraceID过滤
    if opts.TraceID != "" {
        query = query.Must(elastic.NewTermQuery("trace_id", opts.TraceID))
    }
    
    // 执行查询
    searchResult, err := l.elastic.Search().
        Index(fmt.Sprintf("logs-%s", time.Now().Format("2006.01.02"))).
        Query(query).
        Sort("@timestamp", false).
        From(opts.Offset).
        Size(opts.Limit).
        Do(ctx)
        
    if err != nil {
        return nil, err
    }
    
    // 解析结果
    var logs []LogEntry
    for _, hit := range searchResult.Hits.Hits {
        var entry LogEntry
        if err := json.Unmarshal(hit.Source, &entry); err != nil {
            return nil, err
        }
        logs = append(logs, entry)
    }
    
    return logs, nil
}

// 日志查询API处理器
type LogQueryHandler struct {
    logger *Logger
}

func NewLogQueryHandler(logger *Logger) *LogQueryHandler {
    return &LogQueryHandler{logger: logger}
}

func (h *LogQueryHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
    var opts QueryOptions
    
    // 解析查询参数
    startTime := r.URL.Query().Get("start_time")
    if startTime != "" {
        t, err := time.Parse(time.RFC3339, startTime)
        if err == nil {
            opts.StartTime = t
        }
    }
    
    endTime := r.URL.Query().Get("end_time")
    if endTime != "" {
        t, err := time.Parse(time.RFC3339, endTime)
        if err == nil {
            opts.EndTime = t
        }
    }
    
    opts.Level = r.URL.Query().Get("level")
    opts.Service = r.URL.Query().Get("service")
    opts.TraceID = r.URL.Query().Get("trace_id")
    
    limit := r.URL.Query().Get("limit")
    if limit != "" {
        if l, err := strconv.Atoi(limit); err == nil {
            opts.Limit = l
        }
    }
    
    offset := r.URL.Query().Get("offset")
    if offset != "" {
        if o, err := strconv.Atoi(offset); err == nil {
            opts.Offset = o
        }
    }
    
    // 执行查询
    logs, err := h.logger.Query(r.Context(), opts)
    if err != nil {
        http.Error(w, err.Error(), http.StatusInternalServerError)
        return
    }
    
    // 返回结果
    w.Header().Set("Content-Type", "application/json")
    json.NewEncoder(w).Encode(logs)
}

4. 告警系统实现

让我们来实现告警系统:

// alerting/alert.go
package alerting

import (
    "context"
    "time"
)

type AlertLevel string

const (
    AlertLevelInfo     AlertLevel = "info"
    AlertLevelWarning  AlertLevel = "warning"
    AlertLevelError    AlertLevel = "error"
    AlertLevelCritical AlertLevel = "critical"
)

type Alert struct {
    ID          string                 `json:"id"`
    Name        string                 `json:"name"`
    Level       AlertLevel             `json:"level"`
    Message     string                 `json:"message"`
    Labels      map[string]string      `json:"labels"`
    Value       float64                `json:"value"`
    Threshold   float64                `json:"threshold"`
    ServiceName string                 `json:"service_name"`
    Timestamp   time.Time              `json:"timestamp"`
    Metadata    map[string]interface{} `json:"metadata,omitempty"`
}

type AlertRule struct {
    Name         string
    Description  string
    Level        AlertLevel
    Expression   string
    Threshold    float64
    Duration     time.Duration
    Labels       map[string]string
    Annotations  map[string]string
}

type AlertManager struct {
    rules     []AlertRule
    notifiers []Notifier
    history   *AlertHistory
}

type Notifier interface {
    Notify(context.Context, *Alert) error
}

func NewAlertManager() *AlertManager {
    return &AlertManager{
        rules:     make([]AlertRule, 0),
        notifiers: make([]Notifier, 0),
        history:   NewAlertHistory(),
    }
}

func (am *AlertManager) AddRule(rule AlertRule) {
    am.rules = append(am.rules, rule)
}

func (am *AlertManager) AddNotifier(notifier Notifier) {
    am.notifiers = append(am.notifiers, notifier)
}

func (am *AlertManager) Start(ctx context.Context) {
    ticker := time.NewTicker(30 * time.Second)
    defer ticker.Stop()

    for {
        select {
        case <-ctx.Done():
            return
        case <-ticker.C:
            am.evaluate(ctx)
        }
    }
}

func (am *AlertManager) evaluate(ctx context.Context) {
    for _, rule := range am.rules {
        alerts := am.evaluateRule(ctx, rule)
        for _, alert := range alerts {
            // 检查是否是重复告警
            if am.history.IsDuplicate(alert) {
                continue
            }

            // 记录告警历史
            am.history.Add(alert)

            // 发送告警通知
            for _, notifier := range am.notifiers {
                go func(n Notifier, a *Alert) {
                    if err := n.Notify(ctx, a); err != nil {
                        // 记录发送失败的错误
                        log.Printf("Failed to send alert: %v", err)
                    }
                }(notifier, alert)
            }
        }
    }
}

// 告警历史记录
type AlertHistory struct {
    alerts map[string]*Alert
    mu     sync.RWMutex
}

func NewAlertHistory() *AlertHistory {
    return &AlertHistory{
        alerts: make(map[string]*Alert),
    }
}

func (h *AlertHistory) Add(alert *Alert) {
    h.mu.Lock()
    defer h.mu.Unlock()
    h.alerts[alert.ID] = alert
}

func (h *AlertHistory) IsDuplicate(alert *Alert) bool {
    h.mu.RLock()
    defer h.mu.RUnlock()
    
    if prev, exists := h.alerts[alert.ID]; exists {
        // 检查是否在静默期内(1小时内的相同告警被视为重复)
        return time.Since(prev.Timestamp) < time.Hour
    }
    return false
}

// 钉钉通知器实现
type DingTalkNotifier struct {
    webhook string
}

func NewDingTalkNotifier(webhook string) *DingTalkNotifier {
    return &DingTalkNotifier{webhook: webhook}
}

func (d *DingTalkNotifier) Notify(ctx context.Context, alert *Alert) error {
    message := map[string]interface{}{
        "msgtype": "markdown",
        "markdown": map[string]string{
            "title": fmt.Sprintf("【%s】%s", alert.Level, alert.Name),
            "text": fmt.Sprintf("### %s\n\n"+
                "> **服务**: %s\n\n"+
                "> **级别**: %s\n\n"+
                "> **详情**: %s\n\n"+
                "> **时间**: %s\n\n"+
                "> **值**: %.2f (阈值: %.2f)\n\n",
                alert.Name,
                alert.ServiceName,
                alert.Level,
                alert.Message,
                alert.Timestamp.Format("2006-01-02 15:04:05"),
                alert.Value,
                alert.Threshold,
            ),
        },
    }

    payload, err := json.Marshal(message)
    if err != nil {
        return err
    }

    resp, err := http.Post(d.webhook, "application/json", bytes.NewBuffer(payload))
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return fmt.Errorf("dingtalk notification failed: %d", resp.StatusCode)
    }

    return nil
}

5. 监控大盘设计

让我们为以上功能设计一个完整的监控大盘:
在这里插入图片描述

6. 系统集成示例

下面是一个完整的系统集成示例:

// main.go
package main

import (
    "context"
    "log"
    "net/http"
    "time"
)

func main() {
    // 初始化监控收集器
    collector := metrics.NewCollector("example_service")

    // 初始化追踪器
    tracer, err := tracing.NewTracer("example_service", "http://jaeger:14268/api/traces")
    if err != nil {
        log.Fatal(err)
    }

    // 初始化日志系统
    logger, err := logging.NewLogger("example_service", "http://elasticsearch:9200")
    if err != nil {
        log.Fatal(err)
    }

    // 初始化告警管理器
    alertManager := alerting.NewAlertManager()
    
    // 添加钉钉通知器
    dingTalk := alerting.NewDingTalkNotifier("your-webhook-url")
    alertManager.AddNotifier(dingTalk)

    // 添加告警规则
    alertManager.AddRule(alerting.AlertRule{
        Name:        "High Error Rate",
        Description: "Service error rate is too high",
        Level:       alerting.AlertLevelError,
        Expression:  "rate(errors_total[5m]) > 0.1",
        Threshold:   0.1,
        Duration:    5 * time.Minute,
        Labels: map[string]string
        // main.go (续)
        {
            "service": "example_service",
            "severity": "error",
        },
        Annotations: map[string]string{
            "description": "Error rate exceeded 10% in the last 5 minutes",
            "runbook": "https://wiki.example.com/runbook/high-error-rate",
        },
    })

    alertManager.AddRule(alerting.AlertRule{
        Name:        "High Latency",
        Description: "Service latency is too high",
        Level:       alerting.AlertLevelWarning,
        Expression:  "histogram_quantile(0.95, rate(http_request_duration_seconds[5m])) > 0.5",
        Threshold:   0.5,
        Duration:    5 * time.Minute,
        Labels: map[string]string{
            "service": "example_service",
            "severity": "warning",
        },
    })

    // 创建HTTP处理器
    mux := http.NewServeMux()

    // 注册指标采集端点
    mux.Handle("/metrics", promhttp.Handler())

    // 注册健康检查端点
    mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
        w.WriteHeader(http.StatusOK)
        w.Write([]byte("OK"))
    })

    // 创建示例API端点
    mux.HandleFunc("/api/example", func(w http.ResponseWriter, r *http.Request) {
        // 记录请求开始时间
        start := time.Now()

        // 创建span
        ctx, span := tracer.StartSpan(r.Context(), "example_api")
        defer span.End()

        // 记录正在处理的请求
        collector.TrackInFlightRequests(r.Method, 1)
        defer collector.TrackInFlightRequests(r.Method, -1)

        // 模拟业务处理
        time.Sleep(time.Millisecond * 100)

        // 记录业务指标
        collector.RecordBusinessOperation("example_api", "success")

        // 计算处理时间
        duration := time.Since(start)

        // 记录请求指标
        collector.RecordRequest(r.Method, "/api/example", http.StatusOK, duration)

        // 记录日志
        logger.Info(ctx, "API request processed",
            zap.String("method", r.Method),
            zap.String("path", "/api/example"),
            zap.Duration("duration", duration),
        )

        w.WriteHeader(http.StatusOK)
        w.Write([]byte("Success"))
    })

    // 创建带有中间件的服务器
    server := &http.Server{
        Addr: ":8080",
        Handler: chainMiddleware(mux,
            metricsMiddleware(collector),
            traceMiddleware(tracer),
            logMiddleware(logger),
        ),
    }

    // 启动告警管理器
    ctx := context.Background()
    go alertManager.Start(ctx)

    // 启动资源监控
    go monitorResources(collector)

    // 启动服务器
    log.Printf("Server starting on :8080")
    if err := server.ListenAndServe(); err != nil {
        log.Fatal(err)
    }
}

// 中间件链接器
func chainMiddleware(handler http.Handler, middlewares ...func(http.Handler) http.Handler) http.Handler {
    for _, middleware := range middlewares {
        handler = middleware(handler)
    }
    return handler
}

// 指标中间件
func metricsMiddleware(collector *metrics.Collector) func(http.Handler) http.Handler {
    return func(next http.Handler) http.Handler {
        return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
            start := time.Now()
            
            // 包装ResponseWriter以捕获状态码
            ww := &responseWriter{w: w, status: http.StatusOK}
            
            next.ServeHTTP(ww, r)
            
            // 记录请求指标
            duration := time.Since(start)
            collector.RecordRequest(r.Method, r.URL.Path, ww.status, duration)
        })
    }
}

// 追踪中间件
func traceMiddleware(tracer *tracing.Tracer) func(http.Handler) http.Handler {
    return func(next http.Handler) http.Handler {
        return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
            ctx, span := tracer.StartSpan(r.Context(), "http_request",
                trace.WithAttributes(
                    attribute.String("http.method", r.Method),
                    attribute.String("http.url", r.URL.String()),
                ),
            )
            defer span.End()

            next.ServeHTTP(w, r.WithContext(ctx))
        })
    }
}

// 日志中间件
func logMiddleware(logger *logging.Logger) func(http.Handler) http.Handler {
    return func(next http.Handler) http.Handler {
        return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
            start := time.Now()
            
            ww := &responseWriter{w: w, status: http.StatusOK}
            
            next.ServeHTTP(ww, r)
            
            // 记录请求日志
            logger.Info(r.Context(), "HTTP Request",
                zap.String("method", r.Method),
                zap.String("path", r.URL.Path),
                zap.Int("status", ww.status),
                zap.Duration("duration", time.Since(start)),
            )
        })
    }
}

// ResponseWriter包装器
type responseWriter struct {
    w      http.ResponseWriter
    status int
}

func (w *responseWriter) Header() http.Header {
    return w.w.Header()
}

func (w *responseWriter) Write(b []byte) (int, error) {
    return w.w.Write(b)
}

func (w *responseWriter) WriteHeader(statusCode int) {
    w.status = statusCode
    w.w.WriteHeader(statusCode)
}

// 资源监控
func monitorResources(collector *metrics.Collector) {
    ticker := time.NewTicker(15 * time.Second)
    defer ticker.Stop()

    for {
        <-ticker.C
        
        // 更新CPU使用率
        cpuUsage := getCPUUsage()
        collector.UpdateCPUUsage("total", cpuUsage)
        
        // 更新内存使用情况
        var m runtime.MemStats
        runtime.ReadMemStats(&m)
        collector.UpdateMemoryUsage("heap", float64(m.Alloc))
        collector.UpdateMemoryUsage("system", float64(m.Sys))
        
        // 更新goroutine数量
        collector.UpdateGoroutineCount(int64(runtime.NumGoroutine()))
    }
}

func getCPUUsage() float64 {
    var cpuUsage float64
    percentage, err := cpu.Percent(time.Second, false)
    if err == nil && len(percentage) > 0 {
        cpuUsage = percentage[0]
    }
    return cpuUsage
}

7. 监控指标说明

让我们通过一个表格来总结需要监控的关键指标:

指标类型指标名称说明告警阈值
性能指标QPS每秒请求数>1000
性能指标P95延迟95%请求的响应时间>500ms
性能指标P99延迟99%请求的响应时间>1s
错误指标错误率请求错误率>1%
错误指标5xx错误服务器错误数>10/分钟
资源指标CPU使用率CPU使用百分比>80%
资源指标内存使用率内存使用百分比>80%
资源指标Goroutine数量协程数量>10000
业务指标成功率业务操作成功率<99%
业务指标处理延迟业务处理时间>1s

8. 链路追踪设计

这个完整的服务监控与追踪系统实现了以下功能:

  1. 监控指标采集

    • HTTP请求监控
    • 系统资源监控
    • 业务指标监控
    • 错误监控
  2. 分布式链路追踪

    • 调用链路记录
    • 性能分析
    • 错误定位
    • 服务依赖分析
  3. 日志处理系统

    • 多级别日志
    • 结构化日志
    • 日志聚合
    • 日志查询
  4. 告警系统

    • 多级别告警
    • 告警规则配置
    • 告警通知
    • 告警历史记录

怎么样今天的内容还满意吗?再次感谢观众老爷的观看,关注GZH:凡人的AI工具箱,回复666,送您价值199的AI大礼包。最后,祝您早日实现财务自由,还请给个赞,谢谢!

;