用户做题模块需求
1. QzWebsite.log 做题网站日志数据
{
"createtime": "2019-07-22 11:47:18", //创建时间
"creator": "admin", //创建者
"dn": "webA", //网站分区
"domain": "-",
"dt": "20190722", //日期分区
"multicastgateway": "-",
"multicastport": "-",
"multicastserver": "-",
"sequence": "-",
"siteid": 0, //网站id
"sitename": "sitename0", //网站名称
"status": "-",
"templateserver": "-"
}
2. QzSiteCourse.log 网站课程日志数据
{
"boardid": 64, //课程模板id
"coursechapter": "-",
"courseid": 66, //课程id
"createtime": "2019-07-22 11:43:32", //创建时间
"creator": "admin", //创建者
"dn": "webA", //网站分区
"dt": "20190722", //日期分区
"helpparperstatus": "-",
"sequence": "-",
"servertype": "-",
"showstatus": "-",
"sitecourseid": 2, //网站课程id
"sitecoursename": "sitecoursename2", //网站课程名称
"siteid": 77, //网站id
"status": "-"
}
3. QzQuestionType.log 题目类型数据
{
"createtime": "2019-07-22 10:42:47", //创建时间
"creator": "admin", //创建者
"description": "-",
"dn": "webA", //网站分区
"dt": "20190722", //日期分区
"papertypename": "-",
"questypeid": 0, //做题类型id
"quesviewtype": 0,
"remark": "-",
"sequence": "-",
"splitscoretype": "-",
"status": "-",
"viewtypename": "viewtypename0"
}
4. QzQuestion.log 做题日志数据
{
"analysis": "-",
"answer": "-",
"attanswer": "-",
"content": "-",
"createtime": "2019-07-22 11:33:46", //创建时间
"creator": "admin", //创建者
"difficulty": "-",
"dn": "webA", //网站分区
"dt": "20190722", //日期分区
"lecture": "-",
"limitminute": "-",
"modifystatus": "-",
"optnum": 8,
"parentid": 57,
"quesskill": "-",
"questag": "-",
"questionid": 0, //题id
"questypeid": 57, //题目类型id
"quesviewtype": 44,
"score": 24.124501582742543, //题的分数
"splitscore": 0.0,
"status": "-",
"vanalysisaddr": "-",
"vdeoaddr": "-"
}
5. QzPointQuestion.log 做题知识点关联数据
{
"createtime": "2019-07-22 09:16:46", //创建时间
"creator": "admin", //创建者
"dn": "webA", //网站分区
"dt": "20190722", //日期分区
"pointid": 0, //知识点id
"questionid": 0, //题id
"questype": 0
}
6. QzPoint.log 知识点数据日志
{
"chapter": "-", //所属章节
"chapterid": 0, //章节id
"courseid": 0, //课程id
"createtime": "2019-07-22 09:08:52", //创建时间
"creator": "admin", //创建者
"dn": "webA", //网站分区
"dt": "20190722", //日期分区
"excisenum": 73,
"modifystatus": "-",
"pointdescribe": "-",
"pointid": 0, //知识点id
"pointlevel": "9", //知识点级别
"pointlist": "-",
"pointlistid": 82, //知识点列表id
"pointname": "pointname0", //知识点名称
"pointnamelist": "-",
"pointyear": "2019", //知识点所属年份
"remid": "-",
"score": 83.86880766562163, //知识点分数
"sequece": "-",
"status": "-",
"thought": "-",
"typelist": "-"
}
7. QzPaperView.log 试卷视图数据
{
"contesttime": "2019-07-22 19:02:19",
"contesttimelimit": "-",
"createtime": "2019-07-22 19:02:19", //创建时间
"creator": "admin", //创建者
"dayiid": 94,
"description": "-",
"dn": "webA", //网站分区
"downurl": "-",
"dt": "20190722", //日期分区
"explainurl": "-",
"iscontest": "-",
"modifystatus": "-",
"openstatus": "-",
"paperdifficult": "-",
"paperid": 83, //试卷id
"paperparam": "-",
"papertype": "-",
"paperuse": "-",
"paperuseshow": "-",
"paperviewcatid": 1,
"paperviewid": 0, //试卷视图id
"paperviewname": "paperviewname0", //试卷视图名称
"testreport": "-"
}
8. QzPaper.log 做题试卷日志数据
{
"chapter": "-", //章节
"chapterid": 33, //章节id
"chapterlistid": 69, //所属章节列表id
"courseid": 72, //课程id
"createtime": "2019-07-22 19:14:27", //创建时间
"creator": "admin", //创建者
"dn": "webA", //网站分区
"dt": "20190722", //日期分区
"papercatid": 92,
"paperid": 0, //试卷id
"papername": "papername0", //试卷名称
"paperyear": "2019", //试卷所属年份
"status": "-",
"suitnum": "-",
"totalscore": 93.16710017696484 //试卷总分
}
9. QzMemberPaperQuestion.log 学员做题详情数据
{
"chapterid": 33, //章节id
"dn": "webA", //网站分区
"dt": "20190722", //日期分区
"istrue": "-",
"lasttime": "2019-07-22 11:02:30",
"majorid": 77, //主修id
"opertype": "-",
"paperid": 91,//试卷id
"paperviewid": 37, //试卷视图id
"question_answer": 1, //做题结果(0错误 1正确)
"questionid": 94, //题id
"score": 76.6941793631127, //学员成绩分数
"sitecourseid": 1, //网站课程id
"spendtime": 4823, //所用时间单位(秒)
"useranswer": "-",
"userid": 0 //用户id
}
10. QzMajor.log 主修数据
{
"businessid": 41, //主修行业id
"columm_sitetype": "-",
"createtime": "2019-07-22 11:10:20", //创建时间
"creator": "admin", //创建者
"dn": "webA", //网站分区
"dt": "20190722", //日期分区
"majorid": 1, //主修id
"majorname": "majorname1", //主修名称
"sequence": "-",
"shortname": "-",
"siteid": 24, //网站id
"status": "-"
}
11. QzCourseEduSubject.log 课程辅导数据
{
"courseeduid": 0, //课程辅导id
"courseid": 0, //课程id
"createtime": "2019-07-22 11:14:43", //创建时间
"creator": "admin", //创建者
"dn": "webA", //网站分区
"dt": "20190722", //日期分区
"edusubjectid": 44, //辅导科目id
"majorid": 38 //主修id
}
12. QzCourse.log 题库课程数据
{
"chapterlistid": 45, //章节列表id
"courseid": 0, //课程id
"coursename": "coursename0", //课程名称
"createtime": "2019-07-22 11:08:15", //创建时间
"creator": "admin", //创建者
"dn": "webA", //网站分区
"dt": "20190722", //日期分区
"isadvc": "-",
"majorid": 39, //主修id
"pointlistid": 92, //知识点列表id
"sequence": "8128f2c6-2430-42c7-9cb4-787e52da2d98",
"status": "-"
}
13. QzChapterList.log 章节列表数据
{
"chapterallnum": 0, //章节总个数
"chapterlistid": 0, //章节列表id
"chapterlistname": "chapterlistname0", //章节列表名称
"courseid": 71, //课程id
"createtime": "2019-07-22 16:22:19", //创建时间
"creator": "admin", //创建者
"dn": "webA", //网站分区
"dt": "20190722", //日期分区
"status": "-"
}
14. QzChapter.log 章节数据
{
"chapterid": 0, //章节id
"chapterlistid": 0, //所属章节列表id
"chaptername": "chaptername0", //章节名称
"chapternum": 10, //章节个数
"courseid": 61, //课程id
"createtime": "2019-07-22 16:37:24", //创建时间
"creator": "admin", //创建者
"dn": "webA", //网站分区
"dt": "20190722", //日期分区
"outchapterid": 0,
"sequence": "-",
"showstatus": "-",
"status": "-"
}
15. QzCenterPaper.log 试卷主题关联数据
{
"centerid": 55, //主题id
"createtime": "2019-07-22 10:48:30", //创建时间
"creator": "admin", //创建者
"dn": "webA", //网站分区
"dt": "20190722", //日期分区
"openstatus": "-",
"paperviewid": 2, //视图id
"sequence": "-"
}
16. QzCenter.log 主题数据
{
"centerid": 0, //主题id
"centername": "centername0", //主题名称
"centerparam": "-",
"centertype": "3", //主题类型
"centerviewtype": "-",
"centeryear": "2019", //主题年份
"createtime": "2019-07-22 19:13:09", //创建时间
"creator": "-",
"description": "-",
"dn": "webA",
"dt": "20190722", //日期分区
"openstatus": "1",
"provideuser": "-",
"sequence": "-",
"stage": "-"
}
centerid:主题id centername:主题名称 centertype:主题类型 centeryear:主题年份
createtime:创建时间 dn:网站分区 dt:日期分区
17. QzBusiness.log 所属行业数据
{
"businessid": 0, //行业id
"businessname": "bsname0", //行业名称
"createtime": "2019-07-22 10:40:54", //创建时间
"creator": "admin", //创建者
"dn": "webA", //网站分区
"dt": "20190722", //日期分区
"sequence": "-",
"siteid": 1, //所属网站id
"status": "-"
}
模拟数据采集上传数据
与上一个功能类似
解析数据
需求1:使用spark解析ods层数据,将数据存入到对应的hive表中,要求对所有score 分数字段进行保留1位小数并且四舍五入。
维度退化
需求2:基于dwd层基础表数据,需要对表进行维度退化进行表聚合,聚合成dws.dws_qz_chapter(章节维度表),
dws.dws_qz_course(课程维度表),dws.dws_qz_major(主修维度表),dws.dws_qz_paper(试卷维度表),
dws.dws_qz_question(题目维度表),使用spark sql和dataframe api操作
dws.dws_qz_chapter : 4张表join dwd.dwd_qz_chapter inner join dwd.dwd_qz_chapter_list join条件:chapterlistid和dn ,inner join dwd.dwd_qz_point join条件:chapterid和dn, inner join dwd.dwd_qz_point_question join条件:pointid和dn
dws.dws_qz_course:3张表join dwd.dwd_qz_site_course inner join dwd.dwd_qz_course join条件:courseid和dn , inner join dwd.dwd_qz_course_edusubject join条件:courseid和dn
dws.dws_qz_major:3张表join dwd.dwd_qz_major inner join dwd.dwd_qz_website join条件:siteid和dn , inner join dwd.dwd_qz_business join条件:businessid和dn
dws.dws_qz_paper: 4张表join qz_paperview left join qz_center_paper join 条件:paperviewid和dn,
left join qz_center join 条件:centerid和dn, inner join qz_paper join条件:paperid和dn
dws.dws_qz_question:2张表join qz_question inner join qz_questiontype join条件:
questypeid 和dn
宽表合成
需求3:基于dws.dws_qz_chapter、dws.dws_qz_course、dws.dws_qz_major、dws.dws_qz_paper、dws.dws_qz_question、dwd.dwd_qz_member_paper_question 合成宽表dws.user_paper_detail,使用spark sql和dataframe api操作
dws.user_paper_detail: dwd_qz_member_paper_question inner join dws_qz_chapter join条件:chapterid 和dn ,inner join dws_qz_course join条件:sitecourseid和dn , inner join dws_qz_major join条件:majorid和dn, inner join dws_qz_paper join条件:paperviewid和dn , inner join dws_qz_question join条件:questionid和dn
报表层各指标统计
需求4:基于宽表统计各试卷平均耗时、平均分,先使用Spark Sql 完成指标统计,再使用Spark DataFrame Api。
需求5:统计各试卷最高分、最低分,先使用Spark Sql 完成指标统计,再使用Spark DataFrame Api。
需求6:按试卷分组统计每份试卷的前三用户详情,先使用Spark Sql 完成指标统计,再使用Spark DataFrame Api。
需求7:按试卷分组统计每份试卷的倒数前三的用户详情,先使用Spark Sql 完成指标统计,再使用Spark DataFrame Api。
需求8:统计各试卷各分段的用户id,分段有0-20,20-40,40-60,60-80,80-100
需求9:统计试卷未及格的人数,及格的人数,试卷的及格率(及格分数为60)
需求10:统计各题的错误数,正确数,错题率
将数据导入mysql
需求11:统计指标数据导入到ads层后,通过datax将ads层数据导入到mysql中
售课模块功能需求
原始数据格式及字段含义
1.salecourse.log 售课基本数据
{
"chapterid": 2, //章节id
"chaptername": "chaptername2", //章节名称
"courseid": 0, //课程id
"coursemanager": "admin", //课程管理员
"coursename": "coursename0", //课程名称
"dn": "webA", //网站分区
"dt": "20190722", //日期分区
"edusubjectid": 7, //辅导科目id
"edusubjectname": "edusubjectname7", //辅导科目名称
"majorid": 9, //主修id
"majorname": "majorname9", //主修名称
"money": "100", //课程价格
"pointlistid": 9, //知识点列表id
"status": "-", //状态
"teacherid": 8, //老师id
"teachername": "teachername8" //老师名称
}
2. courseshoppingcart.log 课程购物车信息
{
"courseid": 9830, //课程id
"coursename": "coursename9830", //课程名称
"createtime": "2019-07-22 00:00:00", //创建时间
"discount": "8", //折扣
"dn": "webA", //网站分区
"dt": "20190722", //日期分区
"orderid": "odid-0", //订单id
"sellmoney": "80" //购物车金额
}
3.coursepay.log 课程支付订单信息
{
"createitme": "2019-07-22 00:00:00", //创建时间
"discount": "8", //支付折扣
"dn": "webA", //网站分区
"dt": "20190722", //日期分区
"orderid": "odid-0", //订单id
"paymoney": "80" //支付金额
}
模拟数据采集上传数据
hadoop fs -put salecourse.log /user/atguigu/ods
hadoop fs -put coursepay.log /user/atguigu/ods
hadoop fs -put courseshoppingcart.log /user/atguigu/ods
解析数据导入到对应hive表中
关联join聚合表
dwd.dwd_sale_course 与dwd.dwd_course_shopping_cart join条件:courseid、dn、dt
dwd.dwd_course_shopping_cart 与dwd.dwd_course_pay join条件:orderid、dn、dt
不允许丢数据,关联不上的字段为null,join之后导入dws层的表
最后
要求
1:通过Spark UI观察每个task的运行情况、数据量
2:解决数据倾斜问题
思考
(1)第一层表哪些用overwrite合适,哪些用append合适
(2)数据过滤后,重组成需要的数据进行插入表的时候如何控制分区个数,即如何解决小文件过多问题。
(3)合成宽表时一个用户会对应多条明细支付金额数据,如何合并
(4)分区的场景和作用,为什么需要分区