Building a Data Analysis Agent: Improving Analysis Efficiency in Practice

In the previous article, we discussed how to build an intelligent customer service Agent. Today I want to share another real project: building a data analysis Agent. It grew out of a genuine need from one of our financial clients: make data analysis more efficient and speed up decision-making.

Starting from the Analysts' Pain Points

I still remember the conversation with the analyst team:

Xiao Zhang: Analyzing this much data every day is really time-consuming.
Xiao Li: Right, and we constantly have to write all kinds of analysis reports.
Me: Which parts take the most time?
Xiao Zhang: Data cleaning, metric calculation, chart generation... all of it is tedious.
Me: Those are exactly the things an AI Agent can help with.

After working through the requirements, we settled on a few core capabilities:

  1. Intelligent data cleaning
  2. Automated feature analysis
  3. Visualization generation
  4. Report writing

Technical Design

First, the overall architecture:

from typing import List, Dict, Any, Optional
from enum import Enum
from pydantic import BaseModel
import pandas as pd
import numpy as np

class AnalysisTask(Enum):
    CLEAN = "clean"
    ANALYZE = "analyze"
    VISUALIZE = "visualize"
    REPORT = "report"

class DataContext(BaseModel):
    data_path: str
    task_type: AnalysisTask
    requirements: Dict[str, Any]
    history: List[Dict[str, Any]]

class DataAnalyst:
    def __init__(
        self,
        config: Dict[str, Any]
    ):
        # 1. Initialize the analysis model (AnalysisLLM is a project-internal LLM wrapper, not shown in this article)
        self.analysis_model = AnalysisLLM(
            model="gpt-4",
            temperature=0.1,
            context_length=8000
        )

        # 2. Initialize the toolset (each tool shares the same analysis model)
        self.tools = {
            "cleaner": DataCleaner(self.analysis_model),
            "analyzer": DataAnalyzer(self.analysis_model),
            "visualizer": DataVisualizer(self.analysis_model),
            "reporter": ReportGenerator(self.analysis_model)
        }

        # 3. Initialize data storage (DataStore is likewise project-internal and not shown)
        self.data_store = DataStore(
            cache_dir="./cache",
            max_size_gb=10
        )

    async def process_task(
        self,
        context: DataContext
    ) -> Dict[str, Any]:
        # 1. Load the data
        data = await self._load_data(
            context.data_path
        )

        # 2. Understand the requirements
        requirements = await self._understand_requirements(
            context.requirements
        )

        # 3. Generate an analysis plan
        plan = await self._generate_plan(
            data,
            requirements
        )

        # 4. Execute the analysis
        result = await self._execute_analysis(
            data,
            plan
        )

        return result

    async def _understand_requirements(
        self,
        requirements: Dict[str, Any]
    ) -> Dict[str, Any]:
        # 1. Extract the analysis objectives
        objectives = await self.analysis_model.extract_objectives(
            requirements
        )

        # 2. Identify the key metrics
        metrics = await self._identify_metrics(
            objectives
        )

        # 3. Select the analysis methods
        methods = await self._select_methods(
            objectives,
            metrics
        )

        return {
            "objectives": objectives,
            "metrics": metrics,
            "methods": methods
        }
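
The helper methods above are only sketched. As one concrete example, _load_data could be as small as the following. This is a minimal sketch assuming the data arrives as local CSV or Parquet files; the standalone name load_data and the file-type handling are my assumptions, not code from the project:

import asyncio
from pathlib import Path

import pandas as pd

async def load_data(data_path: str) -> pd.DataFrame:
    # Load a local CSV or Parquet file without blocking the event loop.
    suffix = Path(data_path).suffix.lower()
    if suffix == ".csv":
        # pandas I/O is blocking, so push it onto a worker thread
        return await asyncio.to_thread(pd.read_csv, data_path)
    if suffix in (".parquet", ".pq"):
        return await asyncio.to_thread(pd.read_parquet, data_path)
    raise ValueError(f"Unsupported file type: {suffix}")

Inside the class this would simply become self._load_data(context.data_path) with the same body.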

Data Cleaning

Let's start with the data cleaning capability:

class DataCleaner:
    def __init__(
        self,
        model: AnalysisLLM
    ):
        self.model = model

    async def clean_data(
        self,
        data: pd.DataFrame
    ) -> Dict[str, Any]:
        # 1. Profile the data
        profile = await self._profile_data(
            data
        )

        # 2. Identify issues
        issues = await self._identify_issues(
            data,
            profile
        )

        # 3. Perform the cleaning
        cleaned_data = await self._perform_cleaning(
            data,
            issues
        )

        return {
            "cleaned_data": cleaned_data,
            "profile": profile,
            "issues": issues
        }

    async def _identify_issues(
        self,
        data: pd.DataFrame,
        profile: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        issues = []

        # 1. Check for missing values
        missing = await self._check_missing_values(
            data
        )
        issues.extend(missing)

        # 2. Detect outliers
        outliers = await self._detect_outliers(
            data
        )
        issues.extend(outliers)

        # 3. Check data types
        type_issues = await self._check_data_types(
            data
        )
        issues.extend(type_issues)

        return issues

    async def _perform_cleaning(
        self,
        data: pd.DataFrame,
        issues: List[Dict[str, Any]]
    ) -> pd.DataFrame:
        cleaned = data.copy()

        for issue in issues:
            # 1. Handle missing values
            if issue["type"] == "missing":
                cleaned = await self._handle_missing(
                    cleaned,
                    issue
                )

            # 2. Handle outliers
            elif issue["type"] == "outlier":
                cleaned = await self._handle_outlier(
                    cleaned,
                    issue
                )

            # 3. Handle data type issues
            elif issue["type"] == "type":
                cleaned = await self._handle_type(
                    cleaned,
                    issue
                )

        return cleaned
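
The detection and repair helpers are left abstract above. Here is a minimal, rule-based sketch of what _check_missing_values and _handle_missing might boil down to with plain pandas. The issue fields (column, ratio, strategy) and the thresholds are my assumptions; in the real agent the LLM would presumably pick the strategy rather than a fixed rule:

from typing import Any, Dict, List

import pandas as pd

def check_missing_values(data: pd.DataFrame, drop_ratio: float = 0.5) -> List[Dict[str, Any]]:
    # Flag every column that contains missing values and pick a simple strategy.
    issues = []
    for column, ratio in data.isna().mean().items():
        if ratio > 0:
            issues.append({
                "type": "missing",
                "column": column,
                "ratio": float(ratio),
                # heuristic: drop mostly-empty columns, impute the rest
                "strategy": "drop" if ratio > drop_ratio else "impute",
            })
    return issues

def handle_missing(data: pd.DataFrame, issue: Dict[str, Any]) -> pd.DataFrame:
    # Apply the strategy chosen for a single "missing" issue.
    column = issue["column"]
    if issue["strategy"] == "drop":
        return data.drop(columns=[column])
    if pd.api.types.is_numeric_dtype(data[column]):
        return data.fillna({column: data[column].median()})
    return data.fillna({column: data[column].mode().iloc[0]})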

Feature Analysis

Next comes feature analysis:

class DataAnalyzer:
    def __init__(
        self,
        model: AnalysisLLM
    ):
        self.model = model

    async def analyze_features(
        self,
        data: pd.DataFrame,
        requirements: Dict[str, Any]
    ) -> Dict[str, Any]:
        # 1. Statistical analysis
        stats = await self._statistical_analysis(
            data
        )

        # 2. Feature correlations
        correlations = await self._correlation_analysis(
            data
        )

        # 3. Time trends
        trends = await self._trend_analysis(
            data
        )

        return {
            "statistics": stats,
            "correlations": correlations,
            "trends": trends
        }

    async def _statistical_analysis(
        self,
        data: pd.DataFrame
    ) -> Dict[str, Any]:
        stats = {}

        # 1. Basic statistics
        basic_stats = await self._calculate_basic_stats(
            data
        )
        stats["basic"] = basic_stats

        # 2. Distribution analysis
        distribution = await self._analyze_distribution(
            data
        )
        stats["distribution"] = distribution

        # 3. Grouped statistics
        groupby = await self._group_statistics(
            data
        )
        stats["groupby"] = groupby

        return stats

    async def _correlation_analysis(
        self,
        data: pd.DataFrame
    ) -> Dict[str, Any]:
        # 1. Compute the correlation matrix
        corr_matrix = await self._calculate_correlations(
            data
        )

        # 2. Feature importance
        importance = await self._feature_importance(
            data
        )

        # 3. Collinearity check
        collinearity = await self._check_collinearity(
            data
        )

        return {
            "correlation_matrix": corr_matrix,
            "feature_importance": importance,
            "collinearity": collinearity
        }
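
For the correlation step, the core of _calculate_correlations and _check_collinearity can be done with pandas alone. A minimal sketch follows; the 0.9 threshold and the pair-based collinearity flag are my simplifications (a fuller version might use variance inflation factors instead):

from typing import Any, Dict

import pandas as pd

def correlation_analysis(data: pd.DataFrame, threshold: float = 0.9) -> Dict[str, Any]:
    # Correlation matrix plus a simple collinearity flag for numeric columns.
    numeric = data.select_dtypes(include="number")
    corr_matrix = numeric.corr()

    # Flag highly correlated column pairs as potential collinearity.
    collinear_pairs = []
    columns = list(corr_matrix.columns)
    for i, left in enumerate(columns):
        for right in columns[i + 1:]:
            value = corr_matrix.loc[left, right]
            if abs(value) >= threshold:
                collinear_pairs.append({"columns": (left, right), "correlation": float(value)})

    return {
        "correlation_matrix": corr_matrix,
        "collinearity": collinear_pairs,
    }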

Visualization

Next, the visualization layer:

class DataVisualizer:
    def __init__(
        self,
        model: AnalysisLLM
    ):
        self.model = model

    async def create_visualizations(
        self,
        data: pd.DataFrame,
        analysis: Dict[str, Any]
    ) -> Dict[str, Any]:
        # 1. Select chart types
        chart_types = await self._select_charts(
            data,
            analysis
        )

        # 2. Generate the charts
        charts = await self._generate_charts(
            data,
            chart_types
        )

        # 3. Optimize the layout
        optimized = await self._optimize_display(
            charts
        )

        return {
            "charts": charts,
            "layout": optimized
        }

    async def _select_charts(
        self,
        data: pd.DataFrame,
        analysis: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        charts = []

        # 1. Distribution charts
        distribution_charts = await self._distribution_charts(
            data,
            analysis
        )
        charts.extend(distribution_charts)

        # 2. Relationship charts
        relationship_charts = await self._relationship_charts(
            data,
            analysis
        )
        charts.extend(relationship_charts)

        # 3. Trend charts
        trend_charts = await self._trend_charts(
            data,
            analysis
        )
        charts.extend(trend_charts)

        return charts

    async def _generate_charts(
        self,
        data: pd.DataFrame,
        chart_types: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        charts = []

        for chart_type in chart_types:
            # 1. Prepare the plot data
            plot_data = await self._prepare_plot_data(
                data,
                chart_type
            )

            # 2. Set the chart style
            style = await self._set_chart_style(
                chart_type
            )

            # 3. Render the chart
            chart = await self._plot_chart(
                plot_data,
                chart_type,
                style
            )

            charts.append({
                "type": chart_type,
                "data": plot_data,
                "style": style,
                "chart": chart
            })

        return charts
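
What _plot_chart ultimately does is ordinary plotting code. Below is a minimal matplotlib sketch, assuming a chart spec of the form {"kind": ..., "x": ..., "y": ..., "title": ...}; that spec shape is my assumption, not the article's:

from typing import Any, Dict

import matplotlib.pyplot as plt
import pandas as pd

def plot_chart(data: pd.DataFrame, chart_type: Dict[str, Any]) -> plt.Figure:
    # Render a single chart from a small spec dict.
    fig, ax = plt.subplots(figsize=(8, 5))
    kind = chart_type["kind"]

    if kind == "histogram":
        ax.hist(data[chart_type["x"]].dropna(), bins=30)
        ax.set_xlabel(chart_type["x"])
    elif kind == "scatter":
        ax.scatter(data[chart_type["x"]], data[chart_type["y"]], alpha=0.5)
        ax.set_xlabel(chart_type["x"])
        ax.set_ylabel(chart_type["y"])
    elif kind == "line":
        ax.plot(data[chart_type["x"]], data[chart_type["y"]])
        ax.set_xlabel(chart_type["x"])
        ax.set_ylabel(chart_type["y"])
    else:
        raise ValueError(f"Unsupported chart kind: {kind}")

    ax.set_title(chart_type.get("title", kind))
    fig.tight_layout()
    return fig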

Report Generation

Finally, report generation:

class ReportGenerator:
    def __init__(
        self,
        model: AnalysisLLM
    ):
        self.model = model

    async def generate_report(
        self,
        data: pd.DataFrame,
        analysis: Dict[str, Any],
        visualizations: Dict[str, Any]
    ) -> Dict[str, Any]:
        # 1. Extract the key points
        key_points = await self._extract_key_points(
            analysis
        )

        # 2. Create the report structure
        structure = await self._create_structure(
            key_points
        )

        # 3. Write the content
        content = await self._write_content(
            structure,
            analysis,
            visualizations
        )

        return {
            "key_points": key_points,
            "structure": structure,
            "content": content
        }

    async def _extract_key_points(
        self,
        analysis: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        points = []

        # 1. Statistical findings
        statistical_points = await self._extract_statistical_points(
            analysis["statistics"]
        )
        points.extend(statistical_points)

        # 2. Correlation findings
        correlation_points = await self._extract_correlation_points(
            analysis["correlations"]
        )
        points.extend(correlation_points)

        # 3. Trend findings
        trend_points = await self._extract_trend_points(
            analysis["trends"]
        )
        points.extend(trend_points)

        return points

    async def _write_content(
        self,
        structure: Dict[str, Any],
        analysis: Dict[str, Any],
        visualizations: Dict[str, Any]
    ) -> Dict[str, str]:
        content = {}

        # 1. Write the summary
        content["summary"] = await self._write_summary(
            structure,
            analysis
        )

        # 2. Write the body
        content["body"] = await self._write_body(
            structure,
            analysis,
            visualizations
        )

        # 3. Write the conclusion
        content["conclusion"] = await self._write_conclusion(
            structure,
            analysis
        )

        return content
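
Once _write_content has produced the summary, body, and conclusion, the last step is usually just stitching them into a deliverable. A minimal sketch that renders the pieces as a Markdown document; the assumption that each key point carries a "description" field is mine:

from typing import Any, Dict, List

def render_report(key_points: List[Dict[str, Any]], content: Dict[str, str]) -> str:
    # Stitch the generated sections into a single Markdown document.
    lines = ["# Analysis Report", "", "## Summary", content["summary"], ""]

    lines.append("## Key Findings")
    for point in key_points:
        lines.append(f"- {point['description']}")
    lines.append("")

    lines.extend(["## Details", content["body"], "", "## Conclusion", content["conclusion"]])
    return "\n".join(lines)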

Results in Practice

After two months of use, the data analysis Agent has delivered a clear boost in efficiency:

  1. Time savings

    • Data cleaning time cut by 70%
    • Analysis workflows sped up by 50%
    • Report generation efficiency up by 60%
  2. Quality improvements

    • More comprehensive analysis
    • More polished charts
    • More consistent, standardized reports
  3. Expanded capabilities

    • Support for more data sources
    • A richer set of analysis methods
    • More flexible visualizations

Lessons Learned

Building this data analysis Agent taught me a few things:

  1. Stay requirements-driven

    • Understand the analysis goals
    • Focus on the metrics that matter
    • Prioritize practical value
  2. Be systematic about methods

    • Keep the analysis methodology systematic
    • Choose tools deliberately
    • Design the workflow clearly
  3. Make the results usable

    • Conclusions should carry real insight
    • Charts should be easy to read
    • Reports should be genuinely useful

Closing Thoughts

A good data analysis Agent does more than crunch numbers: it understands what the data means for the business and helps users uncover the value behind it. Like an experienced analyst, it offers the right analysis and suggestions at the right moment.

In the next article, I will walk through how to build a document assistant Agent. If you have thoughts on building data analysis Agents, I would love to hear them in the comments.
