ClickHouse Common Functions (working notes, to be expanded)
1 Time functions
dateDiff
Computes the difference between two dates/datetimes in the given unit:
select dateDiff('day',parseDateTimeBestEffort(toString(20210601)),today() ) as day_diff;
day_diff|
--------|
30|
-- also usable inline; dateDiff(unit, start, end) returns end - start:
DATEDIFF('hour', toDateTime(min_date_time), toDateTime(last_date_time)) hourDiff
parseDateTimeBestEffort
select parseDateTimeBestEffort(toString(20210601));
parseDateTimeBestEffort(toString(20210601))|
-------------------------------------------|
2021-06-01 00:00:00|
SELECT parseDateTimeBestEffort('23/10/2020 12:12:57')
AS parseDateTimeBestEffort;
SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Asia/Istanbul')
AS parseDateTimeBestEffort;
toYear/toMonth/toYearWeek
Extract the year, month, and week of a date:
select parseDateTimeBestEffort(toString(20200101)) dayDate , toYearWeek(dayDate, 3) weekInt,toYear(dayDate) yearInt, toMonth(dayDate) monthInt;
dayDate |weekInt|yearInt|monthInt|
-------------------|-------|-------|--------|
2020-01-01 00:00:00| 202001| 2020| 1|
formatDateTime
-- format a date/DateTime value as a string with the given pattern
and dx_part_date > formatDateTime(date_sub(day,30,today()), '%Y-%m-%d')
and dx_part_date <= formatDateTime(today(), '%Y-%m-%d')
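The two filter fragments above come from a larger query; a minimal self-contained sketch (no table needed) shows the same calls standalone:

```sql
-- format today's date and the date 30 days ago as 'YYYY-MM-DD' strings
SELECT
    formatDateTime(today(), '%Y-%m-%d')                    AS today_str,
    formatDateTime(date_sub(DAY, 30, today()), '%Y-%m-%d') AS month_ago_str;
```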
FROM_UNIXTIME
-- convert a millisecond timestamp (BigInt) to a formatted string (divide by 1000 first to get seconds)
select FROM_UNIXTIME(toInt32(1675242721092/1000),'%Y-%m-%d');
select FROM_UNIXTIME(toInt32(1675242721092/1000),'%Y-%m-%d %R:00');
select FROM_UNIXTIME(cast(dx_event_time/1000 as bigint),'%Y-%m-%d %R:%S');
select FROM_UNIXTIME(cast(dx_event_time/1000 as bigint),'%Y-%m-%d %H:%M:%S')
toYYYYMMDD
-- convert a date to a UInt number in YYYYMMDD form:
select toYYYYMMDD(date_sub(DAY,30,today())) as dayInt
-- result: 20230115
select
date_key
,formatDateTime(parseDateTimeBestEffort(toString(date_key)),'%Y-%m-%d') as day
,toUnixTimestamp(day)*1000 dayTs
from active_calendar_new
where date_key > toYYYYMMDD(date_sub(day,30,today()))
and date_key <= toYYYYMMDD(today())
and dayTs BETWEEN 1672848000000 and 1675439999999
-- result: 20230116 2023-01-16 1673798400000
2 Aggregate functions
argMax
Syntax: argMax(arg, val)
Returns the arg value for the row with the maximum val. If several different arg values share the maximum val, the first one encountered is returned.
The Tuple version of this function returns the tuple for the maximum val. It works well together with SimpleAggregateFunction.
Typical use: when one value of the day column has several records, fetch the record with the largest receivetime:
select argMax(reg.day, reg.receivetime) regDay,argMax(ul.day, reg.receivetime) activeDay,
SELECT argMax(user, salary), argMax(tuple(user, salary), salary), argMax(tuple(user, salary)) FROM salary;
┌─argMax(user, salary)─┬─argMax(tuple(user, salary), salary)─┬─argMax(tuple(user, salary))─┐
│ director │ ('director',5000) │ ('director',5000) │
└──────────────────────┴─────────────────────────────────────┴─────────────────────────────┘
select
dx_event_name,
dx_part_date,
dx_event_time,
dx_user_id,
dx_uuid
,mission_id
,arrayJoin(warship_info_cfgId) warship_id
from
(
select
dx_event_name,
dx_part_date,
dx_event_time,
dx_user_id,
dx_uuid
,argMax(mission_id ,op_version) as mission_id
,argMax(warship_info ,op_version) as warship_info
,arrayMap(x -> JSONExtractInt(x, 'cfgId'), JSONExtractArrayRaw(warship_info)) as warship_info_cfgId
from dx_data.ds_v_event_101
where op_tag =1
and dx_part_date > formatDateTime(date_sub(day,30,today()), '%Y-%m-%d') and dx_part_date <= formatDateTime(today(), '%Y-%m-%d')
and dx_event_name in ('battle_mission')
GROUP BY
dx_event_name,
dx_part_date,
dx_event_time,
dx_user_id,
dx_uuid
) bm
uniqExact(x)
Computes the exact number of distinct values.
(case when dayDiff < 1 then NULL else IFNULL(uniqExact(activeuuid2), 0) end) retention2,
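The fragment above comes from a retention query; a minimal self-contained sketch using the built-in numbers() table function contrasts uniqExact with the approximate uniq:

```sql
-- exact distinct count vs. the approximate uniq()
SELECT
    uniqExact(number % 10) AS exact_cnt,   -- always exact
    uniq(number % 10)      AS approx_cnt   -- may be approximate on large high-cardinality data
FROM numbers(1000);
```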
3 Other common functions
full join
a full join b USING (column1,column2... )
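A minimal sketch of the syntax above; the inline subqueries and column names are illustrative only:

```sql
SELECT day, pv, uv
FROM (SELECT '2021-06-01' AS day, 100 AS pv) a
FULL JOIN (SELECT '2021-06-02' AS day, 20 AS uv) b USING (day);
-- with join_use_nulls = 1, unmatched columns become NULL instead of type defaults
```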
groupArray
Similar to MySQL's group_concat() function.
Purpose: collapse multiple rows into one row, producing an array column.
select pid,groupArray(chncode) chncodeArray,groupArray(chnname) chnnameArray from odsmysql_wan_promotion_channel_v3 where pid = 1 group by pid
pid|chncodeArray |chnnameArray |
---|--------------|-------------|
1|['1_2','1_58']|['测试','测试默认']|
if
Controls conditional branching. Unlike most systems, ClickHouse always evaluates both expressions, then and else.
Syntax
SELECT if(cond, then, else)
If the condition cond evaluates to a non-zero value, the result of the then expression is returned and the else expression is skipped. If cond is zero or NULL, the then expression is skipped and the result of the else expression is returned.
Arguments
cond – the condition; may evaluate to zero or non-zero. Type: UInt8, Nullable(UInt8), or NULL.
then – the expression returned if the condition is met.
else – the expression returned if the condition is not met.
Returned value
The function evaluates the then and else expressions and returns one of their results, depending on whether the condition cond ended up being zero or not.
Example
Query:
SELECT if(1, plus(2, 2), plus(2, 6))
Result:
┌─plus(2, 2)─┐
│ 4 │
└────────────┘
multiIf
Lets you write the CASE operator more compactly in a query.
multiIf(cond_1, then_1, cond_2, then_2...else)
Arguments:
cond_N — the condition under which the function returns then_N.
then_N — the result of the function when cond_N holds.
else — the result of the function if none of the conditions hold.
The function takes 2N + 1 arguments.
Returned value
The function returns one of the «then_N» values or «else», depending on the conditions cond_N.
Example
SELECT
left,
right,
multiIf(left < right, 'left is smaller', left > right, 'left is greater', left = right, 'Both equal', 'Null value') AS result
FROM LEFT_RIGHT
┌─left─┬─right─┬─result──────────┐
│ ᴺᵁᴸᴸ │ 4 │ Null value │
│ 1 │ 3 │ left is smaller │
│ 2 │ 2 │ Both equal │
│ 3 │ 1 │ left is greater │
│ 4 │ ᴺᵁᴸᴸ │ Null value │
└──────┴───────┴─────────────────┘
round
-- compute a percentage, keeping two decimal places, e.g. 66.66%
Example:
concat(round(if(activeUserNums > 0, divide(doneUserNums * 100.00, activeUserNums), 0), 2),'%')
extractAll
Converts a string to an array via regular-expression matching.
Example:
SELECT extractAll('[101,202,103,104]', '\\d+');
-- result: ['101','202','103','104']
arrayJoin
This is a very useful function.
Normal functions do not change the number of rows in the result set; they only compute a value for each row (map).
Aggregate functions compress many rows into one (fold or reduce).
The arrayJoin function takes each row and unfolds it into multiple rows (unfold).
It takes an array as its argument and replicates the row once per array element.
All other column values are simply copied; only the value in the column the function is applied to is replaced by the corresponding array element.
A query can use several arrayJoin functions; in that case the transformation is performed multiple times.
Note the ARRAY JOIN syntax in SELECT queries, which provides broader possibilities.
Example:
SELECT arrayJoin([1, 2, 3] AS src) AS dst, 'Hello', src
┌─dst─┬─\'Hello\'─┬─src─────┐
│ 1 │ Hello │ [1,2,3] │
│ 2 │ Hello │ [1,2,3] │
│ 3 │ Hello │ [1,2,3] │
└─────┴───────────┴─────────┘
arrayMap
Example:
enemy_info:[{"cfgId":2090017,"id":-1,"level":1,"star":1,"type":2},{"cfgId":2160017,"id":-2,"level":1,"star":1,"type":2},{"cfgId":2150017,"id":-3,"level":1,"star":1,"type":2},{"cfgId":2150017,"id":-4,"level":1,"star":1,"type":2},{"cfgId":2100017,"id":-5,"level":1,"star":1,"type":2},{"cfgId":2020017,"id":-6,"level":1,"star":1,"type":2}]
alive_warship:[105,185,195,130,135,140,145,155]
SELECT enemy_info
,arrayMap(x -> JSONExtractInt(x, 'cfgId'), JSONExtractArrayRaw(enemy_info)) as enemy_info_cfgId
,length(enemy_info_cfgId) nums
,alive_warship
,splitByString(',',SUBSTRING(alive_warship,2,length(alive_warship)-2)) as aliveWarshipArr
,extractAll(alive_warship, '\\d+') as aliveWarshipArr2
from dx_data.ds_v_event_101
where enemy_info != ''
arrayZip
-- combine two array columns ( Array(DateTime), Array(UInt8) ) into one Array(Tuple(DateTime,UInt8)) column
select
toString(num1) num1,num2
from
(
select arrayJoin(arrZip) aj
,aj.1 as num1
,aj.2 as num2
from
(
select '2012-01-01 12:20:00' as field1 , toDateTime(field1) as fiveSecondsTime,
timeSlots(fiveSecondsTime,toUInt32(55),5) as fiveSecondsArr,
[12,11,10,9,8,7,6,5,4,3,202,101] as arr,
arrayZip(fiveSecondsArr,arr) arrZip
) res
) res
timeSlots
-- generates evenly spaced time points
-- arg 1: start time, DateTime; arg 2: duration in seconds after the start time, UInt32; arg 3: interval between points in seconds;
select '2012-01-01 12:20:00' as field1 , toDateTime(field1) as fiveSecondsTime,
timeSlots(fiveSecondsTime,toUInt32(55),5) as fiveSecondsArr
WITH
WITH
(
SELECT sum(bytes)
FROM system.parts
WHERE active
) AS total_disk_usage
SELECT
concat(toString(round((sum(bytes) / total_disk_usage) * 100,2)),'%') AS table_disk_usage,
table
FROM system.parts
GROUP BY table
ORDER BY table_disk_usage DESC
LIMIT 100
Limitations
- Recursive queries are not supported.
- When a subquery is used inside this clause, its result must be a scalar with exactly one row.
- The results of WITH expressions are not available inside subqueries.
Window functions
Task: partition by dx_event_name, order by dx_event_time, and take the first row of each partition
select
*
from
(
select
dx_event_name ,dx_part_date ,dx_event_time ,dx_user_id ,
row_number() over(partition by dx_event_name order by dx_event_time desc) firstRecord
from
v_event_1635208849778401280
where
dx_event_time between 1677513600000 and 1678809599999
) t
where
firstRecord = 1
Fetch columns from the previous record and the next record relative to the current one
with t_duration as
(
select dx_event_name,ag_dx_account_id as dx_account_id,dx_part_date,ag_product_id,date_time
,first_value(dx_event_name) over(PARTITION by dx_account_id order by date_time rows between 1 preceding and 1 preceding) as last_dx_event_name
,first_value(ag_product_id) over(PARTITION by dx_account_id order by date_time rows between 1 preceding and 1 preceding) as last_product_id
,first_value(date_time) over(PARTITION by dx_account_id order by date_time rows between 1 preceding and 1 preceding) as last_date_time
,first_value(dx_event_name) over(PARTITION by dx_account_id order by date_time rows between 1 following and 1 following) as next_dx_event_name
,first_value(ag_product_id) over(PARTITION by dx_account_id order by date_time rows between 1 following and 1 following) as next_product_id
,first_value(date_time) over(PARTITION by dx_account_id order by date_time rows between 1 following and 1 following) as next_date_time
from
(
SELECT dx_event_name,dx_part_date,
FROM_UNIXTIME(cast(dx_event_time/1000 as bigint),'%Y-%m-%d %R:%S') as date_time,
argMax(dx_account_id,op_version) ag_dx_account_id,
argMax(product_id,op_version) ag_product_id
FROM dx_data.v_event_1460422998065549312
WHERE op_tag = 1
AND dx_event_name in ('reg','pay')
AND dx_event_time BETWEEN 1678032000000 and 1680623999999
AND dx_lib = 'Java'
AND dx_account_id != ''
and app_id in (1,2,3,5,6,12)
GROUP BY
dx_event_name,dx_part_date,dx_event_time,dx_user_id,dx_uuid
) t
)
Official examples:
https://clickhouse.com/docs/en/sql-reference/window-functions#clickhouse-specific-window-functions
CREATE TABLE wf_frame
(
`part_key` UInt64,
`value` UInt64,
`order` UInt64
)
ENGINE = Memory;
INSERT INTO wf_frame FORMAT Values
(1,1,1), (1,2,2), (1,3,3), (1,4,4), (1,5,5);
-- frame is bounded by bounds of a partition (BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
SELECT
part_key,
value,
order,
groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC
Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS frame_values
FROM wf_frame
ORDER BY
part_key ASC,
value ASC;
┌─part_key─┬─value─┬─order─┬─frame_values─┐
│ 1 │ 1 │ 1 │ [1,2,3,4,5] │
│ 1 │ 2 │ 2 │ [1,2,3,4,5] │
│ 1 │ 3 │ 3 │ [1,2,3,4,5] │
│ 1 │ 4 │ 4 │ [1,2,3,4,5] │
│ 1 │ 5 │ 5 │ [1,2,3,4,5] │
└──────────┴───────┴───────┴──────────────┘
-- short form - no bound expression, no order by
SELECT
part_key,
value,
order,
groupArray(value) OVER (PARTITION BY part_key) AS frame_values
FROM wf_frame
ORDER BY
part_key ASC,
value ASC;
┌─part_key─┬─value─┬─order─┬─frame_values─┐
│ 1 │ 1 │ 1 │ [1,2,3,4,5] │
│ 1 │ 2 │ 2 │ [1,2,3,4,5] │
│ 1 │ 3 │ 3 │ [1,2,3,4,5] │
│ 1 │ 4 │ 4 │ [1,2,3,4,5] │
│ 1 │ 5 │ 5 │ [1,2,3,4,5] │
└──────────┴───────┴───────┴──────────────┘
-- frame is bounded by the beginning of a partition and the current row
SELECT
part_key,
value,
order,
groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC
Rows BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS frame_values
FROM wf_frame
ORDER BY
part_key ASC,
value ASC;
┌─part_key─┬─value─┬─order─┬─frame_values─┐
│ 1 │ 1 │ 1 │ [1] │
│ 1 │ 2 │ 2 │ [1,2] │
│ 1 │ 3 │ 3 │ [1,2,3] │
│ 1 │ 4 │ 4 │ [1,2,3,4] │
│ 1 │ 5 │ 5 │ [1,2,3,4,5] │
└──────────┴───────┴───────┴──────────────┘
-- short form (frame is bounded by the beginning of a partition and the current row)
SELECT
part_key,
value,
order,
groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC) AS frame_values
FROM wf_frame
ORDER BY
part_key ASC,
value ASC;
┌─part_key─┬─value─┬─order─┬─frame_values─┐
│ 1 │ 1 │ 1 │ [1] │
│ 1 │ 2 │ 2 │ [1,2] │
│ 1 │ 3 │ 3 │ [1,2,3] │
│ 1 │ 4 │ 4 │ [1,2,3,4] │
│ 1 │ 5 │ 5 │ [1,2,3,4,5] │
└──────────┴───────┴───────┴──────────────┘
-- frame is bounded by the beginning of a partition and the current row, but order is backward
SELECT
part_key,
value,
order,
groupArray(value) OVER (PARTITION BY part_key ORDER BY order DESC) AS frame_values
FROM wf_frame
ORDER BY
part_key ASC,
value ASC;
┌─part_key─┬─value─┬─order─┬─frame_values─┐
│ 1 │ 1 │ 1 │ [5,4,3,2,1] │
│ 1 │ 2 │ 2 │ [5,4,3,2] │
│ 1 │ 3 │ 3 │ [5,4,3] │
│ 1 │ 4 │ 4 │ [5,4] │
│ 1 │ 5 │ 5 │ [5] │
└──────────┴───────┴───────┴──────────────┘
-- sliding frame - 1 PRECEDING ROW AND CURRENT ROW
SELECT
part_key,
value,
order,
groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC
Rows BETWEEN 1 PRECEDING AND CURRENT ROW) AS frame_values
FROM wf_frame
ORDER BY
part_key ASC,
value ASC;
┌─part_key─┬─value─┬─order─┬─frame_values─┐
│ 1 │ 1 │ 1 │ [1] │
│ 1 │ 2 │ 2 │ [1,2] │
│ 1 │ 3 │ 3 │ [2,3] │
│ 1 │ 4 │ 4 │ [3,4] │
│ 1 │ 5 │ 5 │ [4,5] │
└──────────┴───────┴───────┴──────────────┘
-- sliding frame - Rows BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING
SELECT
part_key,
value,
order,
groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC
Rows BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING) AS frame_values
FROM wf_frame
ORDER BY
part_key ASC,
value ASC;
┌─part_key─┬─value─┬─order─┬─frame_values─┐
│ 1 │ 1 │ 1 │ [1,2,3,4,5] │
│ 1 │ 2 │ 2 │ [1,2,3,4,5] │
│ 1 │ 3 │ 3 │ [2,3,4,5] │
│ 1 │ 4 │ 4 │ [3,4,5] │
│ 1 │ 5 │ 5 │ [4,5] │
└──────────┴───────┴───────┴──────────────┘
-- row_number does not respect the frame, so rn_1 = rn_2 = rn_3 != rn_4
SELECT
part_key,
value,
order,
groupArray(value) OVER w1 AS frame_values,
row_number() OVER w1 AS rn_1,
sum(1) OVER w1 AS rn_2,
row_number() OVER w2 AS rn_3,
sum(1) OVER w2 AS rn_4
FROM wf_frame
WINDOW
w1 AS (PARTITION BY part_key ORDER BY order DESC),
w2 AS (PARTITION BY part_key ORDER BY order DESC
Rows BETWEEN 1 PRECEDING AND CURRENT ROW)
ORDER BY
part_key ASC,
value ASC;
┌─part_key─┬─value─┬─order─┬─frame_values─┬─rn_1─┬─rn_2─┬─rn_3─┬─rn_4─┐
│ 1 │ 1 │ 1 │ [5,4,3,2,1] │ 5 │ 5 │ 5 │ 2 │
│ 1 │ 2 │ 2 │ [5,4,3,2] │ 4 │ 4 │ 4 │ 2 │
│ 1 │ 3 │ 3 │ [5,4,3] │ 3 │ 3 │ 3 │ 2 │
│ 1 │ 4 │ 4 │ [5,4] │ 2 │ 2 │ 2 │ 2 │
│ 1 │ 5 │ 5 │ [5] │ 1 │ 1 │ 1 │ 1 │
└──────────┴───────┴───────┴──────────────┴──────┴──────┴──────┴──────┘
-- first_value and last_value respect the frame
SELECT
groupArray(value) OVER w1 AS frame_values_1,
first_value(value) OVER w1 AS first_value_1,
last_value(value) OVER w1 AS last_value_1,
groupArray(value) OVER w2 AS frame_values_2,
first_value(value) OVER w2 AS first_value_2,
last_value(value) OVER w2 AS last_value_2
FROM wf_frame
WINDOW
w1 AS (PARTITION BY part_key ORDER BY order ASC),
w2 AS (PARTITION BY part_key ORDER BY order ASC Rows BETWEEN 1 PRECEDING AND CURRENT ROW)
ORDER BY
part_key ASC,
value ASC;
┌─frame_values_1─┬─first_value_1─┬─last_value_1─┬─frame_values_2─┬─first_value_2─┬─last_value_2─┐
│ [1] │ 1 │ 1 │ [1] │ 1 │ 1 │
│ [1,2] │ 1 │ 2 │ [1,2] │ 1 │ 2 │
│ [1,2,3] │ 1 │ 3 │ [2,3] │ 2 │ 3 │
│ [1,2,3,4] │ 1 │ 4 │ [3,4] │ 3 │ 4 │
│ [1,2,3,4,5] │ 1 │ 5 │ [4,5] │ 4 │ 5 │
└────────────────┴───────────────┴──────────────┴────────────────┴───────────────┴──────────────┘
-- second value within the frame
SELECT
groupArray(value) OVER w1 AS frame_values_1,
nth_value(value, 2) OVER w1 AS second_value
FROM wf_frame
WINDOW w1 AS (PARTITION BY part_key ORDER BY order ASC Rows BETWEEN 3 PRECEDING AND CURRENT ROW)
ORDER BY
part_key ASC,
value ASC
┌─frame_values_1─┬─second_value─┐
│ [1] │ 0 │
│ [1,2] │ 2 │
│ [1,2,3] │ 2 │
│ [1,2,3,4] │ 2 │
│ [2,3,4,5] │ 3 │
└────────────────┴──────────────┘
-- second value within the frame + Null for missing values
SELECT
groupArray(value) OVER w1 AS frame_values_1,
nth_value(toNullable(value), 2) OVER w1 AS second_value
FROM wf_frame
WINDOW w1 AS (PARTITION BY part_key ORDER BY order ASC Rows BETWEEN 3 PRECEDING AND CURRENT ROW)
ORDER BY
part_key ASC,
value ASC
┌─frame_values_1─┬─second_value─┐
│ [1] │ ᴺᵁᴸᴸ │
│ [1,2] │ 2 │
│ [1,2,3] │ 2 │
│ [1,2,3,4] │ 2 │
│ [2,3,4,5] │ 3 │
└────────────────┴──────────────┘
4 Keywords
FINAL modifier
When FINAL is specified, ClickHouse fully merges the data before returning the result, thereby performing all the data transformations that happen during merges for the given table engine.
It is applicable when selecting from tables in the MergeTree engine family. Also supported for:
- Replicated versions of MergeTree engines
- the View, Buffer, Distributed, and MaterializedView engines that operate over other engines, provided the underlying tables use MergeTree engines.
SELECT queries with the FINAL modifier are now executed concurrently, which makes them somewhat faster, but drawbacks remain (see below). The max_final_threads setting limits the maximum number of threads used.
Drawbacks
Queries that use FINAL run slower than equivalent queries without it, because:
- data is merged during query execution;
- in addition to the columns listed in the query, FINAL also reads the primary-key columns.
In most cases, avoid using FINAL. A common approach is to write queries that assume the background merge processes of the MergeTree engine have not yet run, and handle this by applying aggregation (for example, discarding duplicates).
Implementation details
If the FROM clause is omitted, data is read from the system.one table.
The system.one table contains exactly one row (it serves the same purpose as the DUAL table found in other DBMSs).
To execute a query, all the columns listed in the query are extracted from the appropriate table; any columns the outer query does not need are thrown away from the subqueries.
If a query does not list any columns (for example, SELECT count() FROM t), some column is extracted from the table anyway (the smallest one is preferred) in order to count the rows.
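A minimal sketch of the trade-off, assuming the t_city demo table from the groupArray example in these notes uses a ReplacingMergeTree engine:

```sql
-- forces a merge at query time: exact, but slower
SELECT province, city FROM t_city FINAL;

-- preferred alternative: assume background merges have not run yet
-- and discard duplicates with aggregation instead
SELECT province, city FROM t_city GROUP BY province, city;
```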
N Examples
1 Build an hour dimension table
Turn one array row into multiple rows of data
SELECT toUInt8(arrayJoin([23 , 22 , 21 , 20 , 19 , 18 , 17 , 16 , 15 , 14 , 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0])) AS hour
hour|
----|
23|
22|
21|
20|
19|
18|
17|
16|
15|
14|
13|
12|
11|
10|
9|
8|
7|
6|
5|
4|
3|
2|
1|
0|
2 groupArray / groupUniqArray
Clickhouse> select province, groupArray(city) from t_city group by province;
SELECT
province,
groupArray(city)
FROM t_city
GROUP BY province
┌─province──┬─groupArray(city)─────────────────────────────┐
│ Shanghai │ ['Shanghai'] │
│ Hubei │ ['Wuhan','Xiangyang'] │
│ Guangdong │ ['Guangzhou','Shenzhen','Dongguan','Zhuhai'] │
└───────────┴──────────────────────────────────────────────┘
Insert a duplicate record:
insert into t_city values('Hubei','Wuhan',now(),2);
Hubei now contains a duplicate 'Wuhan':
Clickhouse> select province, groupArray(city) from t_city group by province;
SELECT
province,
groupArray(city)
FROM t_city
GROUP BY province
┌─province──┬─groupArray(city)─────────────────────────────┐
│ Shanghai │ ['Shanghai'] │
│ Hubei │ ['Wuhan','Xiangyang','Wuhan'] │
│ Guangdong │ ['Guangzhou','Shenzhen','Dongguan','Zhuhai'] │
└───────────┴──────────────────────────────────────────────┘
3 rows in set. Elapsed: 0.002 sec.
Use the groupUniqArray function to deduplicate:
Clickhouse> select province, groupUniqArray(city) from t_city group by province;
SELECT
province,
groupUniqArray(city)
FROM t_city
GROUP BY province
┌─province──┬─groupUniqArray(city)─────────────────────────┐
│ Shanghai │ ['Shanghai'] │
│ Hubei │ ['Wuhan','Xiangyang'] │
│ Guangdong │ ['Zhuhai','Dongguan','Guangzhou','Shenzhen'] │
└───────────┴──────────────────────────────────────────────┘
3 rows in set. Elapsed: 0.003 sec.
3 Rows to columns (pivot)
select ag_foreign_key as foreign_key
,sum(case when ag_item_id = 1 then ag_change_count else 0 end) as cost_item_id_1
,sum(case when ag_item_id = 50006 then ag_change_count else 0 end) as cost_item_id_50006
from
(
select
dx_event_name,
dx_part_date,
dx_event_time,
dx_user_id,
dx_uuid,
argMax(item_id,op_version) as ag_item_id
,argMax(change_count ,op_version) as ag_change_count
,argMax(foreign_key ,op_version) as ag_foreign_key
from dx_data.ds_v_event_101
where op_tag =1
and dx_event_name in ('item')
and item_id in (1,50006)
and dx_part_date > formatDateTime(date_sub(day,30,today()), '%Y-%m-%d') and dx_part_date <= formatDateTime(today(), '%Y-%m-%d')
and foreign_key is not null and foreign_key != ''
and change_count < 0
GROUP BY
dx_event_name,
dx_part_date,
dx_event_time,
dx_user_id,
dx_uuid
) cost1
group by ag_foreign_key
4 Retention algorithm
SELECT dx_part_date AS "日期"
, reg_CNT AS "注册用户数"
, roi_1 AS "当日留存", roi_2 AS "次留" , roi_3 AS "3留" , roi_4 AS "4留" , roi_5 AS "5留" , roi_6 AS "6留" , roi_7 AS "7留"
FROM (
WITH t_reg AS (
SELECT DISTINCT dx_account_id , dx_part_date
FROM dx_data.v_event_1460422998065549312
WHERE op_tag = 1 AND dx_event_name = 'reg'
AND dx_lib = 'Java'
AND dx_account_id != ''
AND dx_event_time BETWEEN 1677427200000 and 1680105599999
)
, t_login AS (
SELECT DISTINCT dx_account_id , dx_part_date
FROM dx_data.v_event_1460422998065549312
WHERE op_tag = 1 AND dx_event_name = 'login'
AND dx_lib = 'Java'
AND dx_account_id != ''
AND dx_event_time > 1677427200000
)
SELECT dx_part_date
, COUNT(DISTINCT dx_account_id) AS reg_CNT
, CONCAT(toString(ROUND(COUNT(IF(date_diff=1, 1,NULL))/reg_CNT*100, 2)), '%') AS roi_1
, CONCAT(toString(ROUND(COUNT(IF(date_diff=2, 1,NULL))/reg_CNT*100, 2)), '%') AS roi_2
, CONCAT(toString(ROUND(COUNT(IF(date_diff=3, 1,NULL))/reg_CNT*100, 2)), '%') AS roi_3
, CONCAT(toString(ROUND(COUNT(IF(date_diff=4, 1,NULL))/reg_CNT*100, 2)), '%') AS roi_4
, CONCAT(toString(ROUND(COUNT(IF(date_diff=5, 1,NULL))/reg_CNT*100, 2)), '%') AS roi_5
, CONCAT(toString(ROUND(COUNT(IF(date_diff=6, 1,NULL))/reg_CNT*100, 2)), '%') AS roi_6
, CONCAT(toString(ROUND(COUNT(IF(date_diff=7, 1,NULL))/reg_CNT*100, 2)), '%') AS roi_7
FROM (
SELECT t1.dx_part_date AS dx_part_date, DATEDIFF('day', toDate(t1.dx_part_date), toDate(t2.dx_part_date))+1 AS date_diff, t1.dx_account_id AS dx_account_id
FROM t_reg t1
INNER JOIN t_login t2 ON t1.dx_account_id = t2.dx_account_id
)
GROUP BY dx_part_date
)
ORDER BY dx_part_date DESC
limit 1000
5 Consecutive-day retention + within-range repurchase algorithm
with t_pay as
(
SELECT dx_part_date,dx_account_id
FROM dx_data.v_event_1460422998065549312
WHERE op_tag = 1
AND dx_event_name = 'pay'
AND dx_event_time BETWEEN 1678032000000 and 1680623999999
AND dx_lib = 'Java'
AND dx_account_id != ''
and app_id in (1,2,3,5,6,12)
and product_id = 306
GROUP BY
dx_part_date,dx_account_id
)
select
dx_part_date as "日期"
,payUserNums as "小额套餐付费用户数"
,contPayUserNums2 as "连续2天充值的用户数"
,contPayUserNums3 as "连续3天充值的用户数"
,contPayUserNums4 as "连续4天充值的用户数"
,contPayUserNums5 as "连续5天充值的用户数"
,contPayUserNums6 as "连续6天充值的用户数"
,contPayUserNums7 as "连续7天充值的用户数"
,rangePayUserNums2 as "2天内有复购的用户数"
,rangePayUserNums3 as "3天内有复购的用户数"
,rangePayUserNums4 as "4天内有复购的用户数"
from
(
select
dx_part_date
,count( dx_account_id) payUserNums
,count( if(ret2>0,dx_account_id,null)) contPayUserNums2
,count( if(ret2>0 and ret3>0,dx_account_id,null)) contPayUserNums3
,count( if(ret2>0 and ret3>0 and ret4>0,dx_account_id,null)) contPayUserNums4
,count( if(ret2>0 and ret3>0 and ret4>0 and ret5>0,dx_account_id,null)) contPayUserNums5
,count( if(ret2>0 and ret3>0 and ret4>0 and ret5>0 and ret6>0,dx_account_id,null)) contPayUserNums6
,count( if(ret2>0 and ret3>0 and ret4>0 and ret5>0 and ret6>0 and ret7>0,dx_account_id,null)) contPayUserNums7
,count( if(ret2>0,dx_account_id,null)) rangePayUserNums2
,count( if((ret2+ret3)>0,dx_account_id,null)) rangePayUserNums3
,count( if((ret2+ret3+ret4)>0,dx_account_id,null)) rangePayUserNums4
from
(
select dx_part_date,dx_account_id
,sum(IF(date_diff = 2,1,0)) as ret2
,sum(IF(date_diff = 3,1,0)) as ret3
,sum(IF(date_diff = 4,1,0)) as ret4
,sum(IF(date_diff = 5,1,0)) as ret5
,sum(IF(date_diff = 6,1,0)) as ret6
,sum(IF(date_diff = 7,1,0)) as ret7
from
(
select
t1.dx_part_date as dx_part_date
, t1.dx_account_id as dx_account_id
, DATEDIFF('day', toDate(t1.dx_part_date),toDate(t2.dx_part_date))+1 AS date_diff
FROM
t_pay t1
left join
t_pay t2
on t1.dx_account_id = t2.dx_account_id
where t1.dx_part_date <= t2.dx_part_date
) ret1
group by dx_part_date,dx_account_id
) ret2
group by dx_part_date
) ret
order by dx_part_date desc
limit 1000
N+1 Notes
1 Differences from ordinary SQL
a. An alias defined at the same SELECT level can be reused by later expressions
SELECT b.date day,
parseDateTimeBestEffort(toString(day)) dayDate,
toString(toYear(dayDate)) yearStr,
toMonth(dayDate) monthInt,
toWeek(dayDate) weekInt,
concat(yearStr,'-',IF(10>monthInt,'0',''),toString(monthInt),'月') AS monthStr,
concat(yearStr,'-',IF(10>weekInt,'0',''),toString(weekInt),'周') AS weekStr,
day AS period,
b. full join cannot directly chain more than 2 subqueries
Fails: ( ... ) a full join ( ... ) b full join ( ... ) c
Works: ( ( ... ) a full join ( ... ) b ) ab left join ( ... ) c
2 Creating distributed tables in ClickHouse
v_event local table
1. CREATE TABLE SQL
CREATE TABLE IF NOT EXISTS dx_data.`v_event_*` ON
CLUSTER cluster_4s2r (`dx_event_name` String,
`dx_user_id` String,
`dx_id` String,
`dx_account_id` String,
`dx_distinct_id` String,
`dx_event_time` UInt64,
`dx_part_date` String,
`dx_send_time` UInt64,
`dx_receive_time` UInt64,
`dx_update_time` UInt64,
`dx_ip` String,
`dx_country` String,
`dx_province` String,
`dx_city` String,
`dx_carrier` String,
`dx_uuid` String,
`dx_error_message` String,
`dx_error_column` String,
`op_tag` UInt8 default 1,
`op_version` UInt64) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{layer}-{shard}/v_event_*_1677229078598_cr',
'{replica}',
op_version) PARTITION BY toYYYYMM(toDate(dx_part_date))
ORDER BY
(dx_event_name,
dx_part_date,
dx_event_time,
dx_id,
dx_uuid) SETTINGS index_granularity = 8192
2. Resulting table definition (SHOW CREATE TABLE)
CREATE TABLE dx_data.v_event_*
(
`dx_event_name` String,
`dx_user_id` String,
`dx_id` String,
`dx_account_id` String,
`dx_distinct_id` String,
`dx_event_time` UInt64,
`dx_part_date` String,
`dx_send_time` UInt64,
`dx_receive_time` UInt64,
`dx_update_time` UInt64,
`dx_ip` String,
`dx_country` String,
`dx_province` String,
`dx_city` String,
`dx_carrier` String,
`dx_uuid` String,
`dx_error_message` String,
`dx_error_column` String,
`op_tag` UInt8 DEFAULT 1,
`op_version` UInt64
)
ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{layer}-{shard}/v_event_*_1677229078598_cr',
'{replica}',
op_version)
PARTITION BY toYYYYMM(toDate(dx_part_date))
ORDER BY (dx_event_name,
dx_part_date,
dx_event_time,
dx_id,
dx_uuid)
SETTINGS index_granularity = 8192
ds_v_event distributed table
1. CREATE TABLE SQL
CREATE TABLE IF NOT EXISTS dx_data.`ds_v_event_*` ON
CLUSTER cluster_4s2r AS dx_data.`v_event_*` ENGINE = Distributed('cluster_4s2r',
'dx_data',
'v_event_*',
rand())
2. Resulting table definition (SHOW CREATE TABLE)
CREATE TABLE dx_data.ds_v_event_*
(
`dx_event_name` String,
`dx_user_id` String,
`dx_id` String,
`dx_account_id` String,
`dx_distinct_id` String,
`dx_event_time` UInt64,
`dx_part_date` String,
`dx_send_time` UInt64,
`dx_receive_time` UInt64,
`dx_update_time` UInt64,
`dx_ip` String,
`dx_country` String,
`dx_province` String,
`dx_city` String,
`dx_carrier` String,
`dx_uuid` String,
`dx_error_message` String,
`dx_error_column` String,
`op_tag` UInt8 DEFAULT 1,
`op_version` UInt64
)
ENGINE = Distributed('cluster_4s2r',
'dx_data',
'v_event_*',
rand())
Notes on distributed queries
1 GLOBAL
a. GLOBAL IN (subquery)
b. cross-database join: dc_a.a GLOBAL LEFT JOIN dc_b.b
c. a GLOBAL LEFT JOIN (subquery) b
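A minimal sketch of pattern a, reusing the ds_v_event_101 distributed table from these notes (illustrative only):

```sql
-- GLOBAL IN runs the subquery once on the initiator node and broadcasts
-- its result to every shard, instead of re-running it on each shard
SELECT count()
FROM dx_data.ds_v_event_101
WHERE dx_user_id GLOBAL IN
(
    SELECT dx_user_id
    FROM dx_data.ds_v_event_101
    WHERE dx_event_name = 'reg'
);
```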
ds_v_event distributed table (Aliyun)
ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{shard}/dx_data/dx_data.v_user_1452889326466764800', '{replica}', op_version)
ORDER BY dx_user_id
SETTINGS index_granularity = 8192
CREATE TABLE dc_dwd.ds_dy_operation_role on CLUSTER cluster_emr AS dc_dwd.dy_operation_role
ENGINE = Distributed(cluster_emr, dc_dwd, dy_operation_role, rand());
3 Druid data-source parameter settings
Detailed guide to Druid connection-pool parameters:
https://blog.51cto.com/u_11103019/3815976
clusterclickhouse:
driver-class-name: ru.yandex.clickhouse.ClickHouseDriver
url: jdbc:clickhouse://192.168.0.0:8888/dx_data?socket_timeout=60000
username: aaaa
password: aaaa
validationQuery: SELECT 1
initial-size: 10
max-active: 50
# maximum wait time when acquiring a connection (ms)
max-wait: 30000
# interval between idle-connection eviction runs (ms)
timeBetweenEvictionRunsMillis: 60000
# minimum time a connection must stay idle in the pool before it can be evicted (ms)
minEvictableIdleTimeMillis: 300000
testWhileIdle: true
testOnBorrow: false
testOnReturn: false
# enable PSCache and set its size per connection
poolPreparedStatements: true
maxPoolPreparedStatementPerConnectionSize: 20
removeAbandoned: true
removeAbandonedTimeout: 1800
logAbandoned: false
ClickHouse official documentation
https://clickhouse.tech/docs
Demo links
JSON functions
https://www.cnblogs.com/MrYang-11-GetKnow/p/15817622.html
Array functions
https://www.cnblogs.com/traditional/p/15226704.html
String functions
https://www.cnblogs.com/traditional/p/15234049.html