Trino下ORC与Parquet查询性能分析
环境
- OS:CentOS 6.5
- JDK:1.8
- 内存:256G
- 磁盘:HDD
- CPU:Dual 8-core Intel® Xeon® CPU (32 Hyper-Threads) E5-2630 v3 @ 2.40GHz
- HDFS:2.9.2
- Hive:2.3.9
- Trino:418
借助Trino对以下格式文件的查询耗时,来分析不同格式文件的查询效率
- ORC
- Parquet
- TextFile
- RCFile
实验数据准备
- 创建对应Hive表
-- ---------------------------------------------------------------------------
-- Table setup (run in the Hive client unless noted otherwise).
-- Creates one raw staging table plus four tables with an identical schema that
-- differ only in storage format: ORC, Parquet, RCFile and TextFile.
-- ---------------------------------------------------------------------------

-- Staging table: each line of the JSON file is kept as one STRING value.
-- The field terminator is set to the newline character so a line is never
-- split into several fields.
CREATE TABLE IF NOT EXISTS test_trino.tmpjson (
    line string
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY "\n";

-- Load the local JSON data file through the Hive client.
LOAD DATA LOCAL INPATH '/opt/documents.json'
OVERWRITE INTO TABLE test_trino.tmpjson;

-- Major-cities-by-country table, ORC format.
-- orc.compress is pinned to NONE because Hive's ORC writer otherwise defaults
-- to ZLIB, which would make this table compressed while the others are not.
CREATE TABLE IF NOT EXISTS test_trino.all_countries_orc (
    `geonameid`    bigint COMMENT '地名ID',
    `name`         string COMMENT '地名',
    `latitude`     double COMMENT '纬度',
    `longitude`    double COMMENT '经度',
    `country_code` string COMMENT '国家编码',
    `population`   bigint COMMENT '城市人口数'
)
COMMENT '国家城市表'
STORED AS ORC
TBLPROPERTIES ('orc.compress' = 'NONE');

-- Same schema, Parquet format; the codec is stated explicitly for clarity.
CREATE TABLE IF NOT EXISTS test_trino.all_countries_parquet (
    `geonameid`    bigint COMMENT '地名ID',
    `name`         string COMMENT '地名',
    `latitude`     double COMMENT '纬度',
    `longitude`    double COMMENT '经度',
    `country_code` string COMMENT '国家编码',
    `population`   bigint COMMENT '城市人口数'
)
COMMENT '国家城市表'
STORED AS PARQUET
TBLPROPERTIES ('parquet.compression' = 'UNCOMPRESSED');

-- Same schema, RCFile format.
CREATE TABLE IF NOT EXISTS test_trino.all_countries_rcf (
    `geonameid`    bigint COMMENT '地名ID',
    `name`         string COMMENT '地名',
    `latitude`     double COMMENT '纬度',
    `longitude`    double COMMENT '经度',
    `country_code` string COMMENT '国家编码',
    `population`   bigint COMMENT '城市人口数'
)
COMMENT '国家城市表'
STORED AS RCFILE;

-- Same schema, plain TextFile format.
CREATE TABLE IF NOT EXISTS test_trino.all_countries_text (
    `geonameid`    bigint COMMENT '地名ID',
    `name`         string COMMENT '地名',
    `latitude`     double COMMENT '纬度',
    `longitude`    double COMMENT '经度',
    `country_code` string COMMENT '国家编码',
    `population`   bigint COMMENT '城市人口数'
)
COMMENT '国家城市表'
STORED AS TEXTFILE;

-- Verification, run from the Trino CLI (not the Hive client): the leading
-- "hive." is the Trino catalog name, which Hive itself does not understand.
DESCRIBE hive.test_trino.all_countries_parquet;
DESCRIBE hive.test_trino.all_countries_orc;
- 不采用任何压缩算法,将数据写入Hive表
-- ---------------------------------------------------------------------------
-- Data load (run in the Hive client).
-- Goal: populate all four format tables from the JSON staging table without
-- any compression, so that query timings compare file formats only.
-- ---------------------------------------------------------------------------

-- Generic job-output compression switches for this session.
SET hive.exec.compress.output=false;
SET mapreduce.output.fileoutputformat.compress=false;

-- The ORC and Parquet writers choose their codec separately from the two
-- switches above, and ORC defaults to ZLIB. Force both to uncompressed.
-- NOTE(review): a table-level 'orc.compress' / 'parquet.compression'
-- property, if present, takes precedence over these session defaults.
SET hive.exec.orc.default.compress=NONE;
SET parquet.compression=UNCOMPRESSED;

-- Each statement below parses every staged JSON line with json_tuple and
-- overwrites one target table. json_tuple yields strings; Hive converts them
-- to the target column types (bigint / double) on insert.

-- Rewrite the data into managed table all_countries_parquet.
INSERT OVERWRITE TABLE test_trino.all_countries_parquet
SELECT json_tuple(line, 'geonameid', 'name', 'latitude', 'longitude', 'country_code', 'population')
       AS (geonameid, name, latitude, longitude, country_code, population)
FROM test_trino.tmpjson;

-- Rewrite the data into managed table all_countries_orc.
INSERT OVERWRITE TABLE test_trino.all_countries_orc
SELECT json_tuple(line, 'geonameid', 'name', 'latitude', 'longitude', 'country_code', 'population')
       AS (geonameid, name, latitude, longitude, country_code, population)
FROM test_trino.tmpjson;

-- Rewrite the data into managed table all_countries_rcf.
INSERT OVERWRITE TABLE test_trino.all_countries_rcf
SELECT json_tuple(line, 'geonameid', 'name', 'latitude', 'longitude', 'country_code', 'population')
       AS (geonameid, name, latitude, longitude, country_code, population)
FROM test_trino.tmpjson;

-- Rewrite the data into managed table all_countries_text.
INSERT OVERWRITE TABLE test_trino.all_countries_text
SELECT json_tuple(line, 'geonameid', 'name', 'latitude', 'longitude', 'country_code', 'population')
       AS (geonameid, name, latitude, longitude, country_code, population)
FROM test_trino.tmpjson;
查询范围
验证范围
- 单列
  - long型
  - string型
  - double型
- 多列
- 聚合
- 条件聚合
查询语句
-- Row count per storage format (Trino).
-- Tables are listed in the same order as every other query group in this
-- file (orc, parquet, rcf, text) so results line up when compared.
SELECT COUNT(*)
FROM hive.test_trino.all_countries_orc;

SELECT COUNT(*)
FROM hive.test_trino.all_countries_parquet;

SELECT COUNT(*)
FROM hive.test_trino.all_countries_rcf;

SELECT COUNT(*)
FROM hive.test_trino.all_countries_text;
-- Single-column projections (Trino), one group per column type.
-- Each query reads exactly one column and returns the first 10 rows.

-- bigint column: geonameid
SELECT geonameid
FROM hive.test_trino.all_countries_orc
LIMIT 10;

SELECT geonameid
FROM hive.test_trino.all_countries_parquet
LIMIT 10;

SELECT geonameid
FROM hive.test_trino.all_countries_rcf
LIMIT 10;

SELECT geonameid
FROM hive.test_trino.all_countries_text
LIMIT 10;

-- string column: name
SELECT name
FROM hive.test_trino.all_countries_orc
LIMIT 10;

SELECT name
FROM hive.test_trino.all_countries_parquet
LIMIT 10;

SELECT name
FROM hive.test_trino.all_countries_rcf
LIMIT 10;

SELECT name
FROM hive.test_trino.all_countries_text
LIMIT 10;

-- double column: latitude
SELECT latitude
FROM hive.test_trino.all_countries_orc
LIMIT 10;

SELECT latitude
FROM hive.test_trino.all_countries_parquet
LIMIT 10;

SELECT latitude
FROM hive.test_trino.all_countries_rcf
LIMIT 10;

SELECT latitude
FROM hive.test_trino.all_countries_text
LIMIT 10;
-- Multi-column projection (Trino): one bigint and one string column,
-- first 10 rows, against each storage format.
SELECT
    geonameid,
    name
FROM hive.test_trino.all_countries_orc
LIMIT 10;

SELECT
    geonameid,
    name
FROM hive.test_trino.all_countries_parquet
LIMIT 10;

SELECT
    geonameid,
    name
FROM hive.test_trino.all_countries_rcf
LIMIT 10;

SELECT
    geonameid,
    name
FROM hive.test_trino.all_countries_text
LIMIT 10;
-- Aggregation (Trino): total population per country code, full table scan.

-- Unordered result.
SELECT
    country_code,
    SUM(population)
FROM hive.test_trino.all_countries_orc
GROUP BY country_code;

SELECT
    country_code,
    SUM(population)
FROM hive.test_trino.all_countries_parquet
GROUP BY country_code;

SELECT
    country_code,
    SUM(population)
FROM hive.test_trino.all_countries_rcf
GROUP BY country_code;

SELECT
    country_code,
    SUM(population)
FROM hive.test_trino.all_countries_text
GROUP BY country_code;

-- Same aggregation with the result sorted by country code.
SELECT
    country_code,
    SUM(population)
FROM hive.test_trino.all_countries_orc
GROUP BY country_code
ORDER BY country_code;

SELECT
    country_code,
    SUM(population)
FROM hive.test_trino.all_countries_parquet
GROUP BY country_code
ORDER BY country_code;

SELECT
    country_code,
    SUM(population)
FROM hive.test_trino.all_countries_rcf
GROUP BY country_code
ORDER BY country_code;

SELECT
    country_code,
    SUM(population)
FROM hive.test_trino.all_countries_text
GROUP BY country_code
ORDER BY country_code;
-- Filtered aggregation (Trino): total population of rows with country code
-- 'CN' and a population above 10000, against each storage format.
SELECT
    country_code,
    SUM(population)
FROM hive.test_trino.all_countries_orc
WHERE country_code = 'CN'
  AND population > 10000
GROUP BY country_code;

SELECT
    country_code,
    SUM(population)
FROM hive.test_trino.all_countries_parquet
WHERE country_code = 'CN'
  AND population > 10000
GROUP BY country_code;

SELECT
    country_code,
    SUM(population)
FROM hive.test_trino.all_countries_rcf
WHERE country_code = 'CN'
  AND population > 10000
GROUP BY country_code;

SELECT
    country_code,
    SUM(population)
FROM hive.test_trino.all_countries_text
WHERE country_code = 'CN'
  AND population > 10000
GROUP BY country_code;
结果分析