Bootstrap

(三)数仓之线上教育平台

用户做题模块需求

1. QzWebsite.log 做题网站日志数据
{
	"createtime": "2019-07-22 11:47:18",  //创建时间
	"creator": "admin",   //创建者
	"dn": "webA",   //网站分区
	"domain": "-",
	"dt": "20190722",  //日期分区
	"multicastgateway": "-",
	"multicastport": "-",
	"multicastserver": "-",
	"sequence": "-",
	"siteid": 0,   //网站id
	"sitename": "sitename0",  //网站名称
	"status": "-",   
	"templateserver": "-"
}
2.	QzSiteCourse.log  网站课程日志数据
{
	"boardid": 64,  //课程模板id
	"coursechapter": "-",  
	"courseid": 66,  //课程id
	"createtime": "2019-07-22 11:43:32",  //创建时间
	"creator": "admin",   //创建者
	"dn": "webA",   //网站分区
	"dt": "20190722",  //日期分区
	"helpparperstatus": "-",
	"sequence": "-",
	"servertype": "-",
	"showstatus": "-",
	"sitecourseid": 2,  //网站课程id
	"sitecoursename": "sitecoursename2",  //网站课程名称
	"siteid": 77,  //网站id
	"status": "-"
}
3.	QzQuestionType.log 题目类型数据
{
	"createtime": "2019-07-22 10:42:47",   //创建时间
	"creator": "admin",    //创建者
	"description": "-",
	"dn": "webA",   //网站分区
	"dt": "20190722",  //日期分区
	"papertypename": "-",
	"questypeid": 0,  //做题类型id
	"quesviewtype": 0,
	"remark": "-",
	"sequence": "-",
	"splitscoretype": "-",
	"status": "-",
	"viewtypename": "viewtypename0"
}
4.	QzQuestion.log 做题日志数据
{
	"analysis": "-",
	"answer": "-",
	"attanswer": "-",
	"content": "-",
	"createtime": "2019-07-22 11:33:46",  //创建时间
	"creator": "admin",  //创建者
	"difficulty": "-",
	"dn": "webA",   //网站分区
	"dt": "20190722",  //日期分区
	"lecture": "-",
	"limitminute": "-",
	"modifystatus": "-",
	"optnum": 8,
	"parentid": 57,
	"quesskill": "-",
	"questag": "-",
	"questionid": 0,  //题id
	"questypeid": 57, //题目类型id
	"quesviewtype": 44,  
	"score": 24.124501582742543, //题的分数
	"splitscore": 0.0,
	"status": "-",
	"vanalysisaddr": "-",
	"vdeoaddr": "-"
}
5.	QzPointQuestion.log 做题知识点关联数据
{
	"createtime": "2019-07-22 09:16:46",   //创建时间
	"creator": "admin",  //创建者
	"dn": "webA",  //网站分区
	"dt": "20190722", //日期分区
	"pointid": 0,  //知识点id
	"questionid": 0, //题id
	"questype": 0  
}
6.	QzPoint.log 知识点数据日志
{
	"chapter": "-",   //所属章节
	"chapterid": 0,  //章节id
	"courseid": 0,  //课程id
	"createtime": "2019-07-22 09:08:52", //创建时间
	"creator": "admin",  //创建者
	"dn": "webA",  //网站分区
	"dt": "20190722",  //日期分区
	"excisenum": 73,
	"modifystatus": "-",
	"pointdescribe": "-",
	"pointid": 0,  //知识点id
	"pointlevel": "9",  //知识点级别
	"pointlist": "-",
	"pointlistid": 82,   //知识点列表id
	"pointname": "pointname0",  //知识点名称
	"pointnamelist": "-",
	"pointyear": "2019", //知识点所属年份
	"remid": "-",
	"score": 83.86880766562163,  //知识点分数
	"sequece": "-",
	"status": "-",
	"thought": "-",
	"typelist": "-"
}
7.	QzPaperView.log 试卷视图数据
{
	"contesttime": "2019-07-22 19:02:19",
	"contesttimelimit": "-",
	"createtime": "2019-07-22 19:02:19",  //创建时间
	"creator": "admin",  //创建者
	"dayiid": 94,
	"description": "-",
	"dn": "webA", //网站分区
	"downurl": "-",
	"dt": "20190722",  //日期分区
	"explainurl": "-",
	"iscontest": "-",
	"modifystatus": "-",
	"openstatus": "-",
	"paperdifficult": "-",
	"paperid": 83,   //试卷id
	"paperparam": "-",
	"papertype": "-",
	"paperuse": "-",
	"paperuseshow": "-",
	"paperviewcatid": 1,
	"paperviewid": 0,  //试卷视图id
	"paperviewname": "paperviewname0",  //试卷视图名称 
	"testreport": "-"
}
8.	QzPaper.log 做题试卷日志数据
{
	"chapter": "-",   //章节
	"chapterid": 33,  //章节id
	"chapterlistid": 69, //所属章节列表id
	"courseid": 72, //课程id
	"createtime": "2019-07-22 19:14:27", //创建时间
	"creator": "admin",  //创建者
	"dn": "webA",  //网站分区
	"dt": "20190722",  //日期分区
	"papercatid": 92,  
	"paperid": 0,  //试卷id
	"papername": "papername0",  //试卷名称
	"paperyear": "2019",  //试卷所属年份
	"status": "-",
	"suitnum": "-",
	"totalscore": 93.16710017696484  //试卷总分
}
9.	 QzMemberPaperQuestion.log 学员做题详情数据
{
	"chapterid": 33, //章节id
	"dn": "webA", //网站分区
	"dt": "20190722", //日期分区
	"istrue": "-",
	"lasttime": "2019-07-22 11:02:30",
	"majorid": 77, //主修id
	"opertype": "-",
	"paperid": 91,//试卷id
	"paperviewid": 37, //试卷视图id
	"question_answer": 1, //做题结果(0错误 1正确)
	"questionid": 94, //题id
	"score": 76.6941793631127,  //学员成绩分数
	"sitecourseid": 1, //网站课程id
	"spendtime": 4823, //所用时间单位(秒)
	"useranswer": "-",
	"userid": 0 //用户id
}
10.	 QzMajor.log 主修数据
{
	"businessid": 41, //主修行业id
	"columm_sitetype": "-",
	"createtime": "2019-07-22 11:10:20", //创建时间
	"creator": "admin",  //创建者
	"dn": "webA",  //网站分区
	"dt": "20190722",  //日期分区
	"majorid": 1,  //主修id
	"majorname": "majorname1",  //主修名称
	"sequence": "-",
	"shortname": "-",
	"siteid": 24, //网站id
	"status": "-"
}
11.	QzCourseEduSubject.log 课程辅导数据
{
	"courseeduid": 0, //课程辅导id
	"courseid": 0,  //课程id
	"createtime": "2019-07-22 11:14:43", //创建时间
	"creator": "admin",  //创建者
	"dn": "webA",  //网站分区
	"dt": "20190722",  //日期分区
	"edusubjectid": 44, //辅导科目id
	"majorid": 38  //主修id
}
12.	QzCourse.log 题库课程数据
{
	"chapterlistid": 45, //章节列表id
	"courseid": 0,  //课程id
	"coursename": "coursename0",  //课程名称 
	"createtime": "2019-07-22 11:08:15", //创建时间
	"creator": "admin",  //创建者
	"dn": "webA",  //网站分区 
	"dt": "20190722",  //日期分区
	"isadvc": "-",
	"majorid": 39,  //主修id
	"pointlistid": 92,  //知识点列表id
	"sequence": "8128f2c6-2430-42c7-9cb4-787e52da2d98",
	"status": "-"
}
13.	QzChapterList.log 章节列表数据
{
	"chapterallnum": 0,  //章节总个数
	"chapterlistid": 0,   //章节列表id
	"chapterlistname": "chapterlistname0",  //章节列表名称
	"courseid": 71,  //课程id
	"createtime": "2019-07-22 16:22:19", //创建时间
	"creator": "admin", //创建者
	"dn": "webA",  //网站分区
	"dt": "20190722",  //日期分区
	"status": "-"
}
14.	QzChapter.log 章节数据 
{
	"chapterid": 0,  //章节id
	"chapterlistid": 0,  //所属章节列表id
	"chaptername": "chaptername0",  //章节名称
	"chapternum": 10,  //章节个数
	"courseid": 61,  //课程id
	"createtime": "2019-07-22 16:37:24",  //创建时间
	"creator": "admin",  //创建者
	"dn": "webA",  //网站分区
	"dt": "20190722",  //日期分区
	"outchapterid": 0,
	"sequence": "-",
	"showstatus": "-",
	"status": "-"
}
15.	 QzCenterPaper.log 试卷主题关联数据
{
	"centerid": 55,   //主题id
	"createtime": "2019-07-22 10:48:30", //创建时间
	"creator": "admin",  //创建者
	"dn": "webA",  //网站分区
	"dt": "20190722",  //日期分区
	"openstatus": "-",
	"paperviewid": 2,  //视图id
	"sequence": "-"
}
16.	 QzCenter.log 主题数据
{
	"centerid": 0,  //主题id
	"centername": "centername0", //主题名称
	"centerparam": "-",
	"centertype": "3",  //主题类型 
	"centerviewtype": "-",
	"centeryear": "2019",  //主题年份
	"createtime": "2019-07-22 19:13:09", //创建时间
	"creator": "-",
	"description": "-",
	"dn": "webA",
	"dt": "20190722", //日期分区
	"openstatus": "1",
	"provideuser": "-",
	"sequence": "-",
	"stage": "-"
}

centerid:主题id centername:主题名称 centertype:主题类型 centeryear:主题年份
createtime:创建时间 dn:网站分区 dt:日期分区

17.	QzBusiness.log 所属行业数据
{
	"businessid": 0,  //行业id
	"businessname": "bsname0",  //行业名称
	"createtime": "2019-07-22 10:40:54",  //创建时间
	"creator": "admin",  //创建者
	"dn": "webA", //网站分区
	"dt": "20190722",  //日期分区
	"sequence": "-",
	"siteid": 1,   //所属网站id
	"status": "-"
}

模拟数据采集上传数据

与上一个功能类似

解析数据

 需求1:使用spark解析ods层数据,将数据存入到对应的hive表中,要求对所有score 分数字段进行保留1位小数并且四舍五入。

维度退化

需求2:基于dwd层基础表数据,需要对表进行维度退化进行表聚合,聚合成dws.dws_qz_chapter(章节维度表),
dws.dws_qz_course(课程维度表),dws.dws_qz_major(主修维度表),dws.dws_qz_paper(试卷维度表),
dws.dws_qz_question(题目维度表),使用spark sql和dataframe api操作

dws.dws_qz_chapter: 4张表join dwd.dwd_qz_chapter inner join dwd.dwd_qz_chapter_list join条件:chapterlistid和dn, inner join dwd.dwd_qz_point join条件:chapterid和dn, inner join dwd.dwd_qz_point_question join条件:pointid和dn
在这里插入图片描述
dws.dws_qz_course:3张表join dwd.dwd_qz_site_course inner join dwd.dwd_qz_course join条件:courseid和dn, inner join dwd.dwd_qz_course_edusubject join条件:courseid和dn
在这里插入图片描述
dws.dws_qz_major:3张表join dwd.dwd_qz_major inner join dwd.dwd_qz_website join条件:siteid和dn , inner join dwd.dwd_qz_business join条件:businessid和dn
在这里插入图片描述
dws.dws_qz_paper: 4张表join qz_paperview left join qz_center_paper join 条件:paperviewid和dn,
left join qz_center join 条件:centerid和dn, inner join qz_paper join条件:paperid和dn
在这里插入图片描述
dws.dws_qz_question:2张表join qz_question inner join qz_questiontype join条件:
questypeid 和dn
在这里插入图片描述

宽表合成

需求3:基于dws.dws_qz_chapter、dws.dws_qz_course、dws.dws_qz_major、dws.dws_qz_paper、dws.dws_qz_question、dwd.dwd_qz_member_paper_question 合成宽表dws.user_paper_detail,使用spark sql和dataframe api操作

dws.user_paper_detail: dwd_qz_member_paper_question inner join dws_qz_chapter join条件:chapterid和dn, inner join dws_qz_course join条件:sitecourseid和dn, inner join dws_qz_major join条件:majorid和dn, inner join dws_qz_paper join条件:paperviewid和dn, inner join dws_qz_question join条件:questionid和dn

报表层各指标统计

需求4:基于宽表统计各试卷平均耗时、平均分,先使用Spark Sql 完成指标统计,再使用Spark DataFrame Api。
 

在这里插入图片描述

需求5:统计各试卷最高分、最低分,先使用Spark Sql 完成指标统计,再使用Spark DataFrame Api。

在这里插入图片描述

需求6:按试卷分组统计每份试卷的前三用户详情,先使用Spark Sql 完成指标统计,再使用Spark DataFrame Api。
 

在这里插入图片描述

需求7:按试卷分组统计每份试卷的倒数前三的用户详情,先使用Spark Sql 完成指标统计,再使用Spark DataFrame Api。

在这里插入图片描述

需求8:统计各试卷各分段的用户id,分段有0-20,20-40,40-60,60-80,80-100

在这里插入图片描述

需求9:统计试卷未及格的人数、及格的人数、试卷的及格率(及格分数为60)

在这里插入图片描述

需求10:统计各题的错误数,正确数,错题率

在这里插入图片描述

将数据导入mysql

需求11:统计指标数据导入到ads层后,通过datax将ads层数据导入到mysql中

售课模块功能需求

原始数据格式及字段含义

1.salecourse.log 售课基本数据
{
	"chapterid": 2,    //章节id
	"chaptername": "chaptername2", //章节名称
	"courseid": 0,  //课程id
	"coursemanager": "admin",   //课程管理员
	"coursename": "coursename0",  //课程名称
	"dn": "webA",  //网站分区
	"dt": "20190722",  //日期分区
	"edusubjectid": 7,  //辅导科目id
	"edusubjectname": "edusubjectname7",  //辅导科目名称
	"majorid": 9,  //主修id
	"majorname": "majorname9",  //主修名称
	"money": "100",   //课程价格
	"pointlistid": 9,  //知识点列表id
	"status": "-",   //状态
	"teacherid": 8,  //老师id
	"teachername": "teachername8"  //老师名称
}
2. courseshoppingcart.log 课程购物车信息
{
	"courseid": 9830,  //课程id
	"coursename": "coursename9830", //课程名称
	"createtime": "2019-07-22 00:00:00", //创建时间
	"discount": "8",  //折扣
	"dn": "webA",  //网站分区
	"dt": "20190722",  //日期分区
	"orderid": "odid-0", //订单id
	"sellmoney": "80" //购物车金额
}
3.coursepay.log 课程支付订单信息
{
	"createitme": "2019-07-22 00:00:00", //创建时间
	"discount": "8",  //支付折扣
	"dn": "webA",  //网站分区
	"dt": "20190722", //日期分区
	"orderid": "odid-0", //订单id
	"paymoney": "80" //支付金额
}

模拟数据采集上传数据

hadoop dfs -put salecourse.log /user/atguigu/ods
hadoop dfs -put coursepay.log /user/atguigu/ods
hadoop dfs -put courseshoppingcart.log /user/atguigu/ods

解析数据导入到对应hive表中

在这里插入图片描述

关联join聚合表

dwd.dwd_sale_course 与dwd.dwd_course_shopping_cart join条件:courseid、dn、dt
dwd.dwd_course_shopping_cart 与dwd.dwd_course_pay join条件:orderid、dn、dt

不允许丢数据,关联不上的字段为null,join之后导入dws层的表
在这里插入图片描述

最后

要求
1:通过Spark UI观察每个task的运行情况、数据量
2:解决数据倾斜问题
思考

(1)第一层表哪些用overwrite合适,哪些用append合适
(2)数据过滤后,重组成需要的数据进行插入表的时候如何控制分区个数,即如何解决小文件过多问题。
(3)合成宽表时一个用户会对应多条明细支付金额数据,如何合并
(4)分区的场景和作用,为什么需要分区

;