For test results of these dataset extractions, see the post “hive、spark 窗口滑动获取数据集” on 风路丞's CSDN blog.
1. Last-7-days dataset
Idea: left join the table with itself, and use the ON conditions to associate each id with every id from its preceding 7 days.
-- 1. Create the table (15,200 rows)
use cf;
drop table if exists taizhou;
create table taizhou(
id string,
content string,
ends string,
one string,
two string,
createdate string,
level string
) comment '台州'
row format delimited fields terminated by '\t';
load data inpath 'hdfs://tqHadoopCluster/cf/taiz.txt' into table taizhou;
-- 2. Mark each row's weekly data
-- 2.1 For each row, find its last-7-days rows, keeping only matches whose event time is not later than the current event's time
drop table if exists taizhou_zhou;
create table taizhou_zhou as
with temp as
(select *, from_unixtime(UNIX_TIMESTAMP(createdate,'yyyy/MM/dd HH:mm'),'yyyy-MM-dd HH:mm') as time from taizhou)
select /*+ repartition(40) */
t1.id as id,t1.time as time,t1.content,t2.id as done_id,t2.time as done_time,t2.content as done_content
from
temp t1
left join
temp t2
on t1.id!=t2.id and datediff(t1.time, t2.time) between 0 and 6 and t1.time>=t2.time;
-- select count(1) from (select id, time,count(1) as c from taizhou_zhou group by id,time) a where a.c=1;
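-- A quick illustration of the date functions used above (illustrative values, not from the dataset):
select datediff('2022-05-19', '2022-05-13');  -- => 6, so "between 0 and 6" spans the current day plus the 6 days before it
select from_unixtime(unix_timestamp('2022/05/19 14:11', 'yyyy/MM/dd HH:mm'), 'yyyy-MM-dd HH:mm');  -- => 2022-05-19 14:11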
-- 3. Split out each row's last-week data (Time taken: 485.678 seconds)
-- 3.1 The model predicts duplicates and similar items, and returns an "id:similarity" list for each
-- 3.1.1 Build each row's comparison objects
-- [Idea] Store the comparison objects in the Hive table as map<string,string>: key = event unique id, value = event content
-- [Implementation] str_to_map, collect_list, window functions
-- [Caveat] str_to_map splits key-value pairs on English commas, so first strip any English commas from the text; otherwise null entries appear.
-- [Function] collect_list(x) aggregates the x values into a list.
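-- A minimal illustration of how these functions compose (hypothetical values, not from the dataset):
select str_to_map(concat_ws(',', 'id1:event text one', 'id2:event text two'));
-- => {"id1":"event text one","id2":"event text two"}
-- An unstripped English comma inside a value would start a new, broken pair and surface as null lookups.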
drop table if exists taizhou_all_map_3d;
create table taizhou_all_map_3d as
select /*+ repartition(240) */
id,
content,
time,
str_to_map(
concat_ws(",",collect_list(concat_ws(':', cast(done_id as string), regexp_replace(done_content, ',', ''))))
) as contentmap
from taizhou_zhou
group by id,content,time;
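-- A hedged sanity check on the map build (rows with no comparison objects may yield an empty or degenerate map):
-- select id, size(contentmap) from taizhou_all_map_3d limit 10;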
------------------------------------- Grouped model prediction below -----------------------------------
-- 3.1.2 The model judges duplicate/similar and outputs the duplicate/similar lists
-- step1. [Notes on the custom UDTF] The function returns 2 for abnormal values, and the Java class receives Hive's map<String,String> as a HashMap.
-- Since HashMap is not thread-safe, parallel execution throws java.util.HashMap$Node cannot be cast to java.util.HashMap$TreeNode.
use cf;
-- Disable vectorized execution
-- set hive.vectorized.execution.enabled=false;
-- set hive.vectorized.execution.reduce.enabled=false;
add jar hdfs://tqHadoopCluster/cf/validate-map-result20220920-0.0.1-bin.jar;
create temporary function checkudf_hashmap as 'com.tianque.IssueCheckHashMapUDTF';
drop table if exists taizhou_all_check_hashmap_3d;
create table taizhou_all_check_hashmap_3d as
select
a.id as id,
a.content as content,
a.time as time,
t1.check_result as issame,
t1.check_ids as sameids
from taizhou_all_map_3d a
lateral view checkudf_hashmap(content, contentmap, 0.95) t1;
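-- Note: since Hive 0.12 the lateral view column aliases may be omitted, in which case the UDTF's declared
-- output column names (here check_result and check_ids) are used; on older versions spell them out:
-- lateral view checkudf_hashmap(content, contentmap, 0.95) t1 as check_result, check_ids;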
-- Launch the job:
nohup spark-sql --master yarn --num-executors 4 --executor-memory 2G --executor-cores 2 --driver-memory 4G -f check.hql &
-- [Test] Check the results
-- select issame, count(1) from taizhou_all_check_hashmap_3d group by issame;
-- 4. Export the data (swap English commas in the content for full-width ones so they do not break the CSV)
spark-sql -e 'select * from cf.taizhou_model' | sed 's/,/,/g' > /home/admin/cf_temp/taizhou/taizhou_model.csv
2. Datasets with a fixed number of groups: ntile(n)
Because the parameter you pass is the number of groups, estimate each group's size in advance to avoid a Spark out-of-memory error from an oversized group: rows per group = total rows in the table / number of groups.
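Once the train table from step 1 below exists, the expected group size can be checked up front (a hedged example for the 30-group split used here):
select count(*) / 30 as rows_per_group from train;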
-- 1. Create the table
use cf;
drop table if exists train;
create table train(
id string,
content string
) comment 'text for model prediction'
row format delimited fields terminated by '\t';
-- Load the HDFS file
load data inpath 'hdfs://tqHadoopCluster/cf/train.txt' into table train;
-- 2. Grouping: split the rows with ntile(n); here into 30 groups
drop table if exists test_train_group;
create table test_train_group as
select id,content, ntile(30) over(order by id) as group_num from train;
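-- A hedged check that ntile spread the rows evenly (group sizes differ by at most 1):
-- select group_num, count(*) as cnt from test_train_group group by group_num order by group_num;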
-- 3. Build the map
drop table if exists test_train_group_map;
create table test_train_group_map as
select /*+ repartition(30) */
group_num,
str_to_map(concat_ws(',',collect_set(concat_ws(':',cast(a.id as string), regexp_replace(a.content, ',', ''))))) as content_map
from
test_train_group a
group by group_num;
------------------------------------- Grouped model prediction below -----------------------------------
-- 4. Grouped prediction
spark-sql --master yarn --num-executors 15 --executor-memory 4G --executor-cores 2 --driver-memory 5G
use cf;
add jar hdfs://tqHadoopCluster/cf/model-batch-eval-0.0.1-bin.jar;
create temporary function batchmap as 'com.tianque.udf.model.BatchMapUdf';
drop table if exists test_train_group_result;
create table test_train_group_result
as
select /*+ repartition(30) */
group_num, batchmap(content_map) as result_map
from test_train_group_map;
-- 5. Explode the map into rows (columns to rows)
drop table if exists test_train_group_model_result;
create table test_train_group_model_result
as
select b.group_num, t1.id, t1.result
from test_train_group_result b
lateral view explode(result_map) t1 as id, result;
-- select count(1),result from test_train_group_model_result group by result;
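-- A minimal illustration of explode on a map (hypothetical values): each key-value pair becomes one row
-- select explode(map('id1', '1', 'id2', '0')) as (id, result);
-- => id1  1
--    id2  0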
-- 6. Join the model results back to the samples
drop table if exists test_train_result;
create table test_train_result
as
select a.id,b.content,a.result
from test_train_group_model_result a
join
train b
on a.id=b.id;
-- 7. Export the table (again swapping English commas for full-width ones)
spark-sql -e 'select * from cf.test_train_result' | sed 's/,/,/g' > /home/admin/cf_temp/train/test_train_result.csv
3. Limiting the comparison to a row range
Involves: the window frame clause over(partition by ... order by ... rows between .. and ..).
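A small sketch of the frame semantics (hypothetical rows, not from the dataset): with rows between 1 following and unbounded following, each row collects only the rows that come after it within its partition.
select id, collect_list(id) over(partition by grp order by id rows between 1 following and unbounded following) as later_ids
from (select 'a' as grp, 1 as id union all select 'a', 2 union all select 'a', 3) t;
-- id=1 => [2,3]; id=2 => [3]; id=3 => []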
-- 1. Clean the Excel data
-- Strip tab characters (they would break the tab-delimited load)
-- sed -i "s/\t//g" test.xlsx
step1: Replace the line breaks inside Excel cells with spaces (in Excel press Ctrl+F, then in the lower-right of the dialog choose "Special" → "Line break"; or type Excel's line-break token "^l" directly).
step2: Save the modified Excel as txt.
step3: Open the txt and save it again as txt, this time choosing UTF-8 (no BOM) encoding.
step4: Upload the BOM-less UTF-8 txt file to the big-data cluster.
-- 2. Create the Hive table
use cf;
create table test(
id string,
serialnumber string,
subject string,
issuecontent string,
createorg string,
createdate string,
place string  -- occurrence area; referenced by the queries below (assumed present in the source file)
) comment '邯郸'
row format delimited fields terminated by '\t';
load data inpath 'hdfs://tqHadoopCluster/txt/test.txt' into table test;
-- 3. The model predicts duplicates and similar items, and returns an "id:similarity" list for each
-- 3.1 Build each row's comparison objects
-- [Key point] Comparison objects are restricted to the same occurrence area
-- [Idea] Store the comparison objects in the Hive table as map<string,string>: key = event unique id, value = event content
-- [Implementation] str_to_map, collect_list, window functions
-- [Caveat] str_to_map splits key-value pairs on English commas, so first strip any English commas from the text; otherwise null entries appear.
-- [Function] collect_list(x) aggregates x into a list. With a window frame over(partition by ... order by ... rows between .. and ..) attached, it collects only the rows inside the frame; otherwise it collects across the whole table.
drop table if exists test_map_3d;
create table test_map_3d as
select
a.id,
a.issuecontent,
a.place,
str_to_map(
concat_ws(",",collect_list(concat_ws(':', cast(id as string), regexp_replace(issuecontent, ',|:', ','))) -- swap English commas/colons for a full-width comma so they cannot collide with str_to_map's delimiters
over(
partition by a.place
order by a.id rows between 1 following and unbounded following)
)
) as contentmap
from test a;
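-- A hedged spot check: every key in contentmap should be an id from the same place, later in id order:
-- select id, place, map_keys(contentmap) from test_map_3d limit 5;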
-- 3.2 The model judges duplicate/similar and outputs the duplicate/similar lists
-- [Notes on the custom UDTF] The function returns 2 for abnormal values, and the Java class receives Hive's map<String,String> as a LazyMap
-- step1. Write the SQL:
use cf;
-- Disable vectorized execution
set hive.vectorized.execution.enabled=false;
set hive.vectorized.execution.reduce.enabled=false;
add jar hdfs://tqHadoopCluster/cf/validate-lazymap-result-0.0.1-bin.jar;
create temporary function checkudf_lazymap as 'com.tianque.IssueCheckLazyMapUDTF';
drop table if exists test_check_lazymap_3d;
create table test_check_lazymap_3d as
select
a.id as id,
a.issuecontent as issuecontent,
a.place as place,
t1.check_result as issame,
t1.check_ids as sameids,
t2.check_result as issimilarity,
t2.check_ids as similarityids
from test_map_3d a
lateral view checkudf_lazymap(issuecontent, contentmap, 0.80) t1
lateral view checkudf_lazymap(issuecontent, contentmap, 0.60) t2;
-- step2. Run hive in the background
[admin@hadoop1 cf_temp]$ ll
-rw-rw-r--. 1 admin admin 626 May 19 14:11 check.sql
[admin@hadoop1 cf_temp]$ nohup hive -f check.sql &