In the previous article, we looked at how to build an intelligent customer-service Agent. Today I want to share another real-world project: building a data analysis Agent. It grew out of a genuine need from one of our financial clients: make data analysis more efficient and speed up decision-making.
Starting from the Analysts' Pain Points
I still remember a conversation with the analyst team:
Xiao Zhang: "We have to analyze so much data every day; it's really time-consuming."
Xiao Li: "Right, and we're constantly writing all kinds of analysis reports."
Me: "Which parts take the most time?"
Xiao Zhang: "Data cleaning, metric calculation, chart generation, all of it is tedious."
Me: "Those are exactly the tasks an AI Agent can help with."
After working through the requirements, we settled on a few core capabilities:
- Intelligent data cleaning
- Automated feature analysis
- Visualization generation
- Report writing
Technical Design
First, the overall architecture:
```python
from typing import Any, Dict, List
from enum import Enum

import pandas as pd
from pydantic import BaseModel


class AnalysisTask(Enum):
    CLEAN = "clean"
    ANALYZE = "analyze"
    VISUALIZE = "visualize"
    REPORT = "report"


class DataContext(BaseModel):
    data_path: str
    task_type: AnalysisTask
    requirements: Dict[str, Any]
    history: List[Dict[str, Any]]


class DataAnalyst:
    def __init__(self, config: Dict[str, Any]):
        # 1. Initialize the analysis model
        self.analysis_model = AnalysisLLM(
            model="gpt-4",
            temperature=0.1,
            context_length=8000
        )
        # 2. Initialize the toolset (each tool shares the analysis model)
        self.tools = {
            "cleaner": DataCleaner(self.analysis_model),
            "analyzer": DataAnalyzer(self.analysis_model),
            "visualizer": DataVisualizer(self.analysis_model),
            "reporter": ReportGenerator(self.analysis_model)
        }
        # 3. Initialize data storage
        self.data_store = DataStore(
            cache_dir="./cache",
            max_size_gb=10
        )

    async def process_task(self, context: DataContext) -> Dict[str, Any]:
        # 1. Load the data
        data = await self._load_data(context.data_path)
        # 2. Understand the requirements
        requirements = await self._understand_requirements(context.requirements)
        # 3. Generate an analysis plan
        plan = await self._generate_plan(data, requirements)
        # 4. Execute the analysis
        result = await self._execute_analysis(data, plan)
        return result

    async def _understand_requirements(
        self, requirements: Dict[str, Any]
    ) -> Dict[str, Any]:
        # 1. Extract the analysis objectives
        objectives = await self.analysis_model.extract_objectives(requirements)
        # 2. Identify the key metrics
        metrics = await self._identify_metrics(objectives)
        # 3. Select the analysis methods
        methods = await self._select_methods(objectives, metrics)
        return {
            "objectives": objectives,
            "metrics": metrics,
            "methods": methods
        }
```
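To make the flow concrete, here is a minimal usage sketch showing how the Agent might be driven from a script. The `main` entry point, the sample file path, and the requirement keys are illustrative assumptions; only `DataAnalyst`, `DataContext`, and `AnalysisTask` come from the design above.

```python
import asyncio

# Hypothetical driver: the file path, config keys, and requirement fields
# below are placeholders, not part of the actual project.
async def main() -> None:
    analyst = DataAnalyst(config={"cache_dir": "./cache"})
    context = DataContext(
        data_path="./data/transactions.csv",   # assumed sample file
        task_type=AnalysisTask.ANALYZE,
        requirements={"goal": "monthly revenue trend", "granularity": "month"},
        history=[]
    )
    result = await analyst.process_task(context)
    print(result.keys())

if __name__ == "__main__":
    asyncio.run(main())
```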
Data Cleaning
Let's start with the data cleaning component:
```python
class DataCleaner:
    def __init__(self, model: AnalysisLLM):
        self.model = model

    async def clean_data(self, data: pd.DataFrame) -> Dict[str, Any]:
        # 1. Profile the data
        profile = await self._profile_data(data)
        # 2. Identify issues
        issues = await self._identify_issues(data, profile)
        # 3. Perform the cleaning
        cleaned_data = await self._perform_cleaning(data, issues)
        return {
            "cleaned_data": cleaned_data,
            "profile": profile,
            "issues": issues
        }

    async def _identify_issues(
        self,
        data: pd.DataFrame,
        profile: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        issues = []
        # 1. Check for missing values
        missing = await self._check_missing_values(data)
        issues.extend(missing)
        # 2. Detect outliers
        outliers = await self._detect_outliers(data)
        issues.extend(outliers)
        # 3. Check data types
        type_issues = await self._check_data_types(data)
        issues.extend(type_issues)
        return issues

    async def _perform_cleaning(
        self,
        data: pd.DataFrame,
        issues: List[Dict[str, Any]]
    ) -> pd.DataFrame:
        cleaned = data.copy()
        for issue in issues:
            # 1. Handle missing values
            if issue["type"] == "missing":
                cleaned = await self._handle_missing(cleaned, issue)
            # 2. Handle outliers
            elif issue["type"] == "outlier":
                cleaned = await self._handle_outlier(cleaned, issue)
            # 3. Handle type problems
            elif issue["type"] == "type":
                cleaned = await self._handle_type(cleaned, issue)
        return cleaned
```
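The helper methods referenced above (`_check_missing_values`, `_handle_missing`, and so on) aren't shown in the article. As a rough idea of what they could look like, here is a plain-pandas sketch of missing-value detection and handling; the shape of the issue dict and the median/mode fill strategy are my own assumptions, not the project's actual rules.

```python
from typing import Any, Dict, List

import pandas as pd


# Assumed issue record shape: {"type": "missing", "column": ..., "ratio": ...}
def check_missing_values(data: pd.DataFrame, threshold: float = 0.0) -> List[Dict[str, Any]]:
    issues = []
    for column in data.columns:
        ratio = float(data[column].isna().mean())
        if ratio > threshold:
            issues.append({"type": "missing", "column": column, "ratio": ratio})
    return issues


def handle_missing(data: pd.DataFrame, issue: Dict[str, Any]) -> pd.DataFrame:
    column = issue["column"]
    cleaned = data.copy()
    if cleaned[column].dtype.kind in "if":  # numeric: fill with the median
        cleaned[column] = cleaned[column].fillna(cleaned[column].median())
    else:                                   # categorical/text: fill with the mode
        cleaned[column] = cleaned[column].fillna(cleaned[column].mode().iloc[0])
    return cleaned
```

In the real `DataCleaner` these would be `async` methods so the LLM can weigh in on the fill strategy; the pure-pandas version only shows the mechanics.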
Feature Analysis
Next comes feature analysis:
```python
class DataAnalyzer:
    def __init__(self, model: AnalysisLLM):
        self.model = model

    async def analyze_features(
        self,
        data: pd.DataFrame,
        requirements: Dict[str, Any]
    ) -> Dict[str, Any]:
        # 1. Statistical analysis
        stats = await self._statistical_analysis(data)
        # 2. Feature correlations
        correlations = await self._correlation_analysis(data)
        # 3. Time trends
        trends = await self._trend_analysis(data)
        return {
            "statistics": stats,
            "correlations": correlations,
            "trends": trends
        }

    async def _statistical_analysis(self, data: pd.DataFrame) -> Dict[str, Any]:
        stats = {}
        # 1. Basic statistics
        basic_stats = await self._calculate_basic_stats(data)
        stats["basic"] = basic_stats
        # 2. Distribution analysis
        distribution = await self._analyze_distribution(data)
        stats["distribution"] = distribution
        # 3. Grouped statistics
        groupby = await self._group_statistics(data)
        stats["groupby"] = groupby
        return stats

    async def _correlation_analysis(self, data: pd.DataFrame) -> Dict[str, Any]:
        # 1. Compute correlation coefficients
        corr_matrix = await self._calculate_correlations(data)
        # 2. Feature importance
        importance = await self._feature_importance(data)
        # 3. Collinearity check
        collinearity = await self._check_collinearity(data)
        return {
            "correlation_matrix": corr_matrix,
            "feature_importance": importance,
            "collinearity": collinearity
        }
```
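For `_calculate_correlations` and `_check_collinearity`, a small pandas sketch is enough to show the idea; the 0.9 collinearity threshold and the pairwise-correlation approach below are illustrative assumptions rather than the thresholds used in the project.

```python
from typing import Any, Dict, List

import pandas as pd


def calculate_correlations(data: pd.DataFrame) -> pd.DataFrame:
    # Pearson correlation over the numeric columns only.
    return data.select_dtypes(include="number").corr(method="pearson")


def check_collinearity(data: pd.DataFrame, threshold: float = 0.9) -> List[Dict[str, Any]]:
    # Flag feature pairs whose absolute correlation exceeds the (assumed) threshold.
    corr = calculate_correlations(data).abs()
    flagged = []
    columns = list(corr.columns)
    for i, first in enumerate(columns):
        for second in columns[i + 1:]:
            if corr.loc[first, second] > threshold:
                flagged.append({
                    "features": (first, second),
                    "correlation": float(corr.loc[first, second])
                })
    return flagged
```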
Visualization
Then the visualization component:
```python
class DataVisualizer:
    def __init__(self, model: AnalysisLLM):
        self.model = model

    async def create_visualizations(
        self,
        data: pd.DataFrame,
        analysis: Dict[str, Any]
    ) -> Dict[str, Any]:
        # 1. Select chart types
        chart_types = await self._select_charts(data, analysis)
        # 2. Generate the charts
        charts = await self._generate_charts(data, chart_types)
        # 3. Optimize the layout
        optimized = await self._optimize_display(charts)
        return {
            "charts": charts,
            "layout": optimized
        }

    async def _select_charts(
        self,
        data: pd.DataFrame,
        analysis: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        charts = []
        # 1. Distribution charts
        distribution_charts = await self._distribution_charts(data, analysis)
        charts.extend(distribution_charts)
        # 2. Relationship charts
        relationship_charts = await self._relationship_charts(data, analysis)
        charts.extend(relationship_charts)
        # 3. Trend charts
        trend_charts = await self._trend_charts(data, analysis)
        charts.extend(trend_charts)
        return charts

    async def _generate_charts(
        self,
        data: pd.DataFrame,
        chart_types: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        charts = []
        for chart_type in chart_types:
            # 1. Prepare the plotting data
            plot_data = await self._prepare_plot_data(data, chart_type)
            # 2. Set the style
            style = await self._set_chart_style(chart_type)
            # 3. Render the chart
            chart = await self._plot_chart(plot_data, chart_type, style)
            charts.append({
                "type": chart_type,
                "data": plot_data,
                "style": style,
                "chart": chart
            })
        return charts
```
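The article doesn't say which plotting library `_plot_chart` uses. Purely as an illustration, here is a matplotlib sketch covering two of the chart families selected above (distribution and trend); the keys expected in `chart_type` are assumptions.

```python
from typing import Any, Dict

import matplotlib.pyplot as plt
import pandas as pd


def plot_chart(plot_data: pd.DataFrame, chart_type: Dict[str, Any]) -> plt.Figure:
    # chart_type is assumed to look like
    # {"kind": "hist" | "line", "column": ..., "title": ...}
    fig, ax = plt.subplots(figsize=(8, 4))
    column = chart_type["column"]

    if chart_type.get("kind") == "hist":
        ax.hist(plot_data[column].dropna(), bins=30)   # distribution chart
    else:
        ax.plot(plot_data.index, plot_data[column])    # trend chart over the index
    ax.set_title(chart_type.get("title", column))
    ax.set_ylabel(column)
    fig.tight_layout()
    return fig
```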
Report Generation
Finally, report generation:
```python
class ReportGenerator:
    def __init__(self, model: AnalysisLLM):
        self.model = model

    async def generate_report(
        self,
        data: pd.DataFrame,
        analysis: Dict[str, Any],
        visualizations: Dict[str, Any]
    ) -> Dict[str, Any]:
        # 1. Extract key points
        key_points = await self._extract_key_points(analysis)
        # 2. Build the report structure
        structure = await self._create_structure(key_points)
        # 3. Write the content
        content = await self._write_content(structure, analysis, visualizations)
        return {
            "key_points": key_points,
            "structure": structure,
            "content": content
        }

    async def _extract_key_points(
        self,
        analysis: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        points = []
        # 1. Statistical findings
        statistical_points = await self._extract_statistical_points(
            analysis["statistics"]
        )
        points.extend(statistical_points)
        # 2. Correlation findings
        correlation_points = await self._extract_correlation_points(
            analysis["correlations"]
        )
        points.extend(correlation_points)
        # 3. Trend findings
        trend_points = await self._extract_trend_points(
            analysis["trends"]
        )
        points.extend(trend_points)
        return points

    async def _write_content(
        self,
        structure: Dict[str, Any],
        analysis: Dict[str, Any],
        visualizations: Dict[str, Any]
    ) -> Dict[str, str]:
        content = {}
        # 1. Write the summary
        content["summary"] = await self._write_summary(structure, analysis)
        # 2. Write the body
        content["body"] = await self._write_body(structure, analysis, visualizations)
        # 3. Write the conclusion
        content["conclusion"] = await self._write_conclusion(structure, analysis)
        return content
```
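`_write_summary` ultimately delegates to the LLM. Since `AnalysisLLM`'s interface isn't shown beyond `extract_objectives`, the sketch below only builds the prompt and assumes a generic `complete(prompt)` method; treat both the method name and the prompt wording as placeholders.

```python
from typing import Any, Dict


def build_summary_prompt(structure: Dict[str, Any], analysis: Dict[str, Any]) -> str:
    # Condense the analysis results into a prompt for the report model.
    # Section names and instruction text are illustrative only.
    sections = "\n".join(f"- {name}" for name in structure.get("sections", []))
    return (
        "You are drafting the executive summary of a data analysis report.\n"
        f"Planned sections:\n{sections}\n\n"
        f"Key statistics: {analysis.get('statistics')}\n"
        f"Correlations: {analysis.get('correlations')}\n"
        f"Trends: {analysis.get('trends')}\n\n"
        "Write a concise summary (3-5 sentences) highlighting the findings "
        "most relevant to decision-making."
    )


# Assumed usage inside ReportGenerator._write_summary:
#   prompt = build_summary_prompt(structure, analysis)
#   return await self.model.complete(prompt)  # 'complete' is an assumed method name
```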
Results in Practice
After two months of use, the data analysis Agent delivered clear efficiency gains:
Time savings
- Data cleaning time reduced by 70%
- Analysis workflow sped up by 50%
- Report generation efficiency improved by 60%
Quality improvements
- More comprehensive analyses
- More professional charts
- More standardized reports
Expanded capabilities
- More data sources supported
- A richer set of analysis methods
- More flexible visualizations
Lessons Learned
Building this data analysis Agent taught me a few things:
Stay requirement-driven
- Understand the analysis objectives
- Focus on the key metrics
- Prioritize practicality
Be systematic about methods
- Analysis methods should be systematic
- Tools should be chosen deliberately
- Workflows should be clearly designed
Make the results usable
- Conclusions should carry real insight
- Charts should be easy to read
- Reports should be actionable
Closing Thoughts
A good data analysis Agent can't just crunch numbers; it has to understand what the numbers mean for the business and help users uncover the value behind the data. It should work like an experienced data analyst, offering the right analysis at the right moment.
In the next article, I'll walk through how to build a document assistant Agent. If you have thoughts on building data analysis Agents, feel free to share them in the comments.