一、目的
在数据质量模块,需要对原始数据的重复性进行统计
Hive中原有的SQL语句与ClickHouse中现有的SQL语句有很大不同
二、Hive中原有代码
2.1 表结构
-- 41. Duplicate-rate statistics table for the eight basic data categories.
-- Per the original note, event data and event resources do not need duplicate statistics.
CREATE TABLE IF NOT EXISTS hurys_db.dwd_data_duplicate (
    data_type      INT    COMMENT '1:转向比,2:统计,3:评价,4:区域,5:过车,6:静态排队,7:动态排队,8:轨迹,9:事件数据,10:事件资源',
    device_no      STRING COMMENT '设备编号',
    data_duplicate FLOAT  COMMENT '数据重复率'
)
COMMENT '数据重复性统计表'
PARTITIONED BY (day STRING)
STORED AS ORC;
2.2 SQL代码
-- Duplicate rate of ods_queue rows (written as data_type '6') for one day.
-- A row counts as duplicated when its (device_no, create_time, lane_no) key
-- occurs more than once; the rate is duplicated rows / all rows of that
-- device on that day. The result overwrites the day partition(s) it produces.
INSERT OVERWRITE TABLE hurys_db.dwd_data_duplicate PARTITION (day)
SELECT
    '6'                            AS data_type,
    device_no,
    round(sum(num) / count_num, 2) AS data_duplicate,
    day
FROM (
    -- One row per key that occurs more than once, with its occurrence count.
    SELECT
        device_no,
        create_time,
        lane_no,
        count(*) AS num,
        count_num,
        day
    FROM (
        -- Attach the per-device, per-day row count to every source row.
        SELECT
            device_no,
            create_time,
            lane_no,
            count(device_no) OVER (PARTITION BY device_no, day) AS count_num,
            day
        FROM hurys_db.ods_queue
        WHERE day = '2024-09-04'
    ) AS queue_rows
    GROUP BY device_no, create_time, lane_no, count_num, day
    HAVING count(*) > 1
) AS duplicate_groups
GROUP BY device_no, count_num, day;
三、ClickHouse中现有代码
3.1 表结构
-- 41. Duplicate-rate statistics table for the eight basic data categories (long-term storage).
CREATE TABLE IF NOT EXISTS hurys_jw.dwd_data_duplicate
(
    data_type      Int32          COMMENT '1:转向比,2:统计,3:评价,4:区域,5:过车,6:静态排队,7:动态排队,8:轨迹,9:事件数据,10:事件资源',
    device_no      String         COMMENT '设备编号',
    data_duplicate Decimal(10, 2) COMMENT '数据重复率',
    day            Date           COMMENT '日期'
)
ENGINE = MergeTree
PARTITION BY day
PRIMARY KEY day
ORDER BY day
SETTINGS index_granularity = 8192;
3.2 SQL代码
-- Duplicate rate of ods_queue rows (emitted as data_type '6') for one day.
-- A row counts as duplicated when its (device_no, create_time, lane_no) key
-- occurs more than once; the rate is duplicated rows / all rows of that
-- device on that day.
SELECT
    '6'                            AS data_type,
    device_no,
    round(sum(num) / count_num, 2) AS data_duplicate,
    day
FROM
(
    -- One row per key that occurs more than once, with its occurrence count.
    SELECT
        device_no,
        create_time,
        lane_no,
        count(*) AS num,
        count_num,
        day
    FROM
    (
        -- Attach the per-device, per-day row count to every source row.
        SELECT
            device_no,
            create_time,
            lane_no,
            count(device_no) OVER (PARTITION BY device_no, DATE(create_time)) AS count_num,
            DATE(create_time) AS day
        FROM hurys_jw.ods_queue
        -- NOTE(review): `day` here presumably resolves to the DATE(create_time)
        -- alias above; confirm if ods_queue also has a physical `day` column.
        WHERE day = '2024-10-22'  -- fixed day for manual runs; the Kettle input uses "day > ?"
    ) AS queue_rows
    GROUP BY device_no, create_time, lane_no, count_num, day
    HAVING count(*) > 1
) AS duplicate_groups
GROUP BY device_no, count_num, day;
3.3 Kettle任务
3.3.1 newtime
3.3.2 替换NULL值
3.3.3 clickhouse输入1
-- Kettle ClickHouse-input query: duplicate rate of ods_queue rows (emitted as
-- data_type '6') for every day after the bound parameter.
-- A row counts as duplicated when its (device_no, create_time, lane_no) key
-- occurs more than once; the rate is duplicated rows / all rows of that
-- device on that day. `day` is returned as a String.
SELECT
    '6'                            AS data_type,
    device_no,
    round(sum(num) / count_num, 2) AS data_duplicate,
    CAST(day AS String)            AS day
FROM
(
    -- One row per key that occurs more than once, with its occurrence count.
    SELECT
        device_no,
        create_time,
        lane_no,
        count(*) AS num,
        count_num,
        day
    FROM
    (
        -- Attach the per-device, per-day row count to every source row.
        SELECT
            device_no,
            create_time,
            lane_no,
            count(device_no) OVER (PARTITION BY device_no, DATE(create_time)) AS count_num,
            DATE(create_time) AS day
        FROM hurys_jw.ods_queue
        -- `?` is the positional parameter bound by the Kettle step.
        WHERE day > ?
    ) AS queue_rows
    GROUP BY device_no, create_time, lane_no, count_num, day
    HAVING count(*) > 1
) AS duplicate_groups
GROUP BY device_no, count_num, day
;
其他clickhouse输入控件代码类似
3.3.4 字段选择
3.3.5 clickhouse输出
3.3.6 执行任务
3.3.7 海豚调度(1天1次)
ClickHouse的SQL语句与Hive真的有好多地方不一样,尤其是函数!