Bootstrap

【Trino实战】Trino下ORC与Parquet查询性能分析

Trino下ORC与Parquet查询性能分析

环境

  • OS:CentOS 6.5

  • JDK:1.8

  • 内存:256G

  • 磁盘:HDD

  • CPU:Dual 8-core Intel® Xeon® CPU (32 Hyper-Threads) E5-2630 v3 @ 2.40GHz

  • HDFS:2.9.2

  • Hive:2.3.9

  • Trino:418

借助Trino对以下格式文件的查询耗时,来分析不同格式文件的查询效率

  • ORC
  • Parquet
  • TextFile
  • RCFile

实验数据准备

  1. 创建对应hive表

    ## 创建json临时表
     create table tmpjson(line string) row format delimited fields terminated by "\n";
    ## 利用hive客户端加载本地json数据文件
     LOAD DATA LOCAL INPATH '/opt/documents.json' OVERWRITE INTO TABLE test_trino.tmpjson;
    
    ## 创建国家主要城市表
     CREATE TABLE `test_trino.all_countries_orc`(
      `geonameid` bigint COMMENT '地名ID', 
      `name` string COMMENT '地名', 
      `latitude` double COMMENT '纬度',
      `longitude` double COMMENT '经度', 
      `country_code` string COMMENT '国家编码',
      `population` bigint COMMENT '城市人口数')
    COMMENT '国家城市表'
    ROW FORMAT SERDE 
      'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
    STORED AS INPUTFORMAT 
      'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' 
    OUTPUTFORMAT 
      'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat';
      
     CREATE TABLE `test_trino.all_countries_parquet`(
      `geonameid` bigint COMMENT '地名ID', 
      `name` string COMMENT '地名', 
      `latitude` double COMMENT '纬度',
      `longitude` double COMMENT '经度', 
      `country_code` string COMMENT '国家编码',
      `population` bigint COMMENT '城市人口数')
    COMMENT '国家城市表' STORED AS PARQUET;
    
     CREATE TABLE `test_trino.all_countries_rcf`(
      `geonameid` bigint COMMENT '地名ID', 
      `name` string COMMENT '地名', 
      `latitude` double COMMENT '纬度',
      `longitude` double COMMENT '经度', 
      `country_code` string COMMENT '国家编码',
      `population` bigint COMMENT '城市人口数')
    COMMENT '国家城市表' STORED AS RCFILE;
    
     CREATE TABLE `test_trino.all_countries_text`(
      `geonameid` bigint COMMENT '地名ID', 
      `name` string COMMENT '地名', 
      `latitude` double COMMENT '纬度',
      `longitude` double COMMENT '经度', 
      `country_code` string COMMENT '国家编码',
      `population` bigint COMMENT '城市人口数')
    COMMENT '国家城市表' STORED AS TEXTFILE;
     
     DESCRIBE hive.test_trino.all_countries_parquet;
     DESCRIBE hive.test_trino.all_countries_orc;
    
  2. 不采用任务压缩算法,将数据写入hive表

    # 不采用压缩,hive客户端会话配置
    SET hive.exec.compress.output=false;
    SET mapreduce.output.fileoutputformat.compress=false;
    
    # 数据复写到内部表all_countries_parquet中
     insert overwrite table `test_trino.all_countries_parquet` select json_tuple(line,'geonameid','name','latitude','longitude','country_code','population')as(geonameid,name,latitude,longitude,country_code,population) from test_trino.tmpjson;
     
    # 数据复写到内部表all_countries_orc中
      insert overwrite table `test_trino.all_countries_orc` select json_tuple(line,'geonameid','name','latitude','longitude','country_code','population')as(geonameid,name,latitude,longitude,country_code,population) from test_trino.tmpjson;
      
    # 数据复写到内部表all_countries_rcf中
      insert overwrite table `test_trino.all_countries_rcf` select json_tuple(line,'geonameid','name','latitude','longitude','country_code','population')as(geonameid,name,latitude,longitude,country_code,population) from test_trino.tmpjson;
      
    # 数据复写到内部表all_countries_text中
      insert overwrite table `test_trino.all_countries_text` select json_tuple(line,'geonameid','name','latitude','longitude','country_code','population')as(geonameid,name,latitude,longitude,country_code,population) from test_trino.tmpjson;
    

查询范围

验证范围

  • 单列
    • long型
    • string型
    • double型
  • 多列
  • 聚合
  • 条件聚合

查询语句

select count(1) from hive.test_trino.all_countries_parquet;
select count(1) from hive.test_trino.all_countries_orc;
select count(1) from hive.test_trino.all_countries_rcf;
select count(1) from hive.test_trino.all_countries_text;

select geonameid from hive.test_trino.all_countries_orc limit 10;
select geonameid from hive.test_trino.all_countries_parquet limit 10;
select geonameid from hive.test_trino.all_countries_rcf limit 10;
select geonameid from hive.test_trino.all_countries_text limit 10;

select name from hive.test_trino.all_countries_orc limit 10;
select name from hive.test_trino.all_countries_parquet limit 10;
select name from hive.test_trino.all_countries_rcf limit 10;
select name from hive.test_trino.all_countries_text limit 10;

select latitude from hive.test_trino.all_countries_orc limit 10;
select latitude from hive.test_trino.all_countries_parquet limit 10;
select latitude from hive.test_trino.all_countries_rcf limit 10;
select latitude from hive.test_trino.all_countries_text limit 10;

select geonameid,name from hive.test_trino.all_countries_orc limit 10;
select geonameid,name from hive.test_trino.all_countries_parquet limit 10;
select geonameid,name from hive.test_trino.all_countries_rcf limit 10;
select geonameid,name from hive.test_trino.all_countries_text limit 10;

select country_code,sum(population) from hive.test_trino.all_countries_orc group by country_code;
select country_code,sum(population) from hive.test_trino.all_countries_parquet group by country_code;
select country_code,sum(population) from hive.test_trino.all_countries_rcf group by country_code;
select country_code,sum(population) from hive.test_trino.all_countries_text group by country_code;

select country_code,sum(population) from hive.test_trino.all_countries_orc group by country_code order by country_code;
select country_code,sum(population) from hive.test_trino.all_countries_parquet group by country_code order by country_code;
select country_code,sum(population) from hive.test_trino.all_countries_rcf group by country_code order by country_code;
select country_code,sum(population) from hive.test_trino.all_countries_text group by country_code order by country_code;


select country_code,sum(population) from hive.test_trino.all_countries_orc where country_code='CN' and population>10000 group by country_code;
select country_code,sum(population) from hive.test_trino.all_countries_parquet where country_code='CN' and population>10000 group by country_code;
select country_code,sum(population) from hive.test_trino.all_countries_rcf where country_code='CN' and population>10000 group by country_code;
select country_code,sum(population) from hive.test_trino.all_countries_text where country_code='CN' and population>10000 group by country_code;

结果分析

千万级数据Hive存储对比
Trino对格式文件的查询
Trino千万级数据查询耗时

;