1 聚合为数组
什么是数组 如[800,2975,3000,1100,3000]
-- Gather every emp.sal value into one array; duplicates are kept.
SELECT collect_list(sal)
FROM emp;
2 将数组以任意格式聚合
2.1 有重复
-- Fragment (no FROM clause in these notes): join all names with commas, duplicates kept.
SELECT concat_ws(',', collect_list(area_manager_name))
2.2 没有重复
-- Fragment (no FROM clause in these notes): join the distinct names with commas.
SELECT concat_ws(',', collect_set(area_manager_name))
2.3 一行转多行
-- One row to many rows:
-- 1. split() cuts the comma-separated string into an array; 2. explode() takes that array and emits one row per element.
select
-- NOTE(review): `...` is the notes' placeholder for the select list, which is not shown here.
...
FROM
nczbigdata.ods_src_membercenter_ncz_member_card
-- The exploded element is exposed under the alias car_brand_no (same name as the source column).
LATERAL VIEW OUTER explode(split(car_brand_no,',')) t AS car_brand_no
WHERE
-- pt is filtered with the ${thedate} substitution variable supplied at run time.
pt = '${thedate}'
2.4 多行转多列
-- Many rows to many columns: pack each mid's (attr_id, attr_value) pairs into one map,
-- then read the wanted attr_id keys out of that map as columns.
select
    mid,
    -- Originally read from kv2, which is never defined; only kv1 exists in the subquery.
    kv1['10000029'] as gb_acc_name, -- gearbox product name, selected by attr_id
    kv1['51'] as gb_no,             -- gearbox model
    kv1['51'] as gb_sub_no,         -- gearbox sub-model; NOTE(review): same key as gb_no, confirm the intended attr_id
    kv1['52'] as company            -- gearbox manufacturer (trailing comma before FROM removed)
from (SELECT
    mid
    -- 'attr_id-attr_value' entries are joined with ',' and parsed back into a map:
    -- ',' separates entries, '-' separates key from value.
    ,str_to_map(concat_ws(',', collect_set(concat(attr_id, '-', attr_value)))
        ,','
        ,'-'
    ) kv1
-- NOTE(review): XXXX is the notes' placeholder, presumably for "FROM <table> ... GROUP BY mid".
XXXX
) t1
3 连续登录
3.1 方案一
-- Consecutive logins, approach 1: within one user, login_date minus its row number
-- is constant across a run of consecutive days, so it serves as the run's group key.
WITH ranked_login AS (
    SELECT
        use_id,
        login_date,
        ROW_NUMBER() OVER (PARTITION BY use_id ORDER BY login_date) AS rn
    FROM test
)
SELECT
    r.use_id,
    DATE_SUB(r.login_date, INTERVAL r.rn DAY) AS date,
    COUNT(1) AS counts
FROM ranked_login AS r
GROUP BY
    r.use_id,
    DATE_SUB(r.login_date, INTERVAL r.rn DAY)
-- keep only runs of at least 3 rows
HAVING COUNT(1) >= 3
3.2 方案二
-- Consecutive logins, approach 2: rn1 is the day offset from the user's first login (+1),
-- rn is the row number by date. rn1 - rn stays constant inside a run of consecutive days,
-- so it must be part of the GROUP BY; grouping by uuid alone counted every login of the
-- user instead of the length of one streak.
-- Assumes one row per (uuid, DATE) -- TODO confirm, or deduplicate table a first.
SELECT
    uuid
    ,COUNT(1) AS consecutive_days
FROM (
    SELECT uuid
        ,datediff(
            concat(DATE,' 00:00:00')
            ,concat(min_date,' 00:00:00')
            ,'dd'
        ) + 1 AS rn1
        ,rn
    FROM (
        SELECT
            a.uuid
            ,ROW_NUMBER() OVER ( PARTITION BY uuid ORDER BY DATE ) AS rn
            ,min(DATE) OVER ( PARTITION BY uuid ) AS min_date
            ,DATE
        FROM a
    ) ranked
) numbered
GROUP BY uuid
    ,rn1 - rn
-- streaks longer than 7 days
HAVING COUNT(1) > 7
4 次日留存
-- Next-day retention on orders: pair every order date of a user with the date of that
-- user's following order row, then keep the pairs that are exactly one day apart.
with a as
(
    -- The inner query names the column next_create_date; the outer select and group by
    -- previously referenced a non-existent last_create_date.
    select create_date, user_id, next_create_date
    from (
        SELECT substr(create_time,1,10) AS create_date
            ,user_id
            ,lead(substr(create_time,1,10), 1) over (partition by user_id order by substr(create_time,1,10)) as next_create_date
        FROM nczbigdata.dws_trade_orders
        WHERE pt = 20220207
    ) ordered
    -- collapses duplicate (date, user, next date) rows produced by several orders on one day
    group by create_date, user_id, next_create_date
)
SELECT *
FROM a t1
WHERE datediff(concat(t1.next_create_date,' 00:00:00'),concat(t1.create_date,' 00:00:00'),'dd') = 1
5 互相关注
-- Mutual follows: build an order-independent key "larger-smaller" for every follow edge;
-- a key that appears more than once is followed in both directions.
WITH follow_edge AS (
    SELECT 1 AS to_user, 2 AS from_user
    UNION ALL
    SELECT 1 AS to_user, 3 AS from_user
    UNION ALL
    SELECT 1 AS to_user, 4 AS from_user
    UNION ALL
    SELECT 2 AS to_user, 1 AS from_user
    UNION ALL
    SELECT 2 AS to_user, 3 AS from_user
    UNION ALL
    SELECT 2 AS to_user, 4 AS from_user
    UNION ALL
    SELECT 3 AS to_user, 1 AS from_user
    UNION ALL
    SELECT 3 AS to_user, 2 AS from_user
    UNION ALL
    SELECT 3 AS to_user, 4 AS from_user
),
pair_key AS (
    SELECT
        CASE
            WHEN to_user > from_user THEN CONCAT(to_user, '-', from_user)
            ELSE CONCAT(from_user, '-', to_user)
        END AS value
    FROM follow_edge
)
SELECT
    value,
    COUNT(1)
FROM pair_key
GROUP BY value
HAVING COUNT(1) > 1
6 先进先出 库龄计算
-- FIFO stock ledger per product (pid), ordered by date: running total of outbound
-- quantities (qty < 0), running total of inbound quantities (qty > 0), and their sum.
SELECT
    pid,
    date,
    qty,
    -- cumulative outbound quantity up to this date
    SUM(CASE WHEN qty < 0 THEN qty ELSE 0 END) OVER (PARTITION BY pid ORDER BY date),
    -- cumulative inbound quantity up to this date
    SUM(CASE WHEN qty > 0 THEN qty ELSE 0 END) OVER (PARTITION BY pid ORDER BY date),
    -- outbound + inbound running totals
    SUM(CASE WHEN qty < 0 THEN qty ELSE 0 END) OVER (PARTITION BY pid ORDER BY date)
        + SUM(CASE WHEN qty > 0 THEN qty ELSE 0 END) OVER (PARTITION BY pid ORDER BY date)
FROM p_dw
7 统计2021年国庆头3天每类视频每天的近一周总点赞量和一周内最大单天转发量
-- For each video tag on 2021-10-01 .. 2021-10-03: likes summed over the trailing 7 rows
-- and the largest single-day retweet count within those rows.
-- The frame counts rows, not calendar days, so it relies on one row per tag per day.
WITH daily_stat AS (
    -- likes and retweets per tag per day
    SELECT
        b.tag,
        DATE_FORMAT(start_time, '%Y-%m-%d') AS dt,
        SUM(if_like) AS sum_like_cnt,
        SUM(if_retweet) AS retweet_cnt_7d
    FROM tb_user_video_log AS a
    LEFT JOIN tb_video_info AS b
        ON a.video_id = b.video_id
    GROUP BY
        b.tag,
        DATE_FORMAT(start_time, '%Y-%m-%d')
),
rolling_stat AS (
    SELECT
        tag,
        dt,
        SUM(sum_like_cnt) OVER (
            PARTITION BY tag ORDER BY dt ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
        ) AS sum_like_cnt_7d,
        MAX(retweet_cnt_7d) OVER (
            PARTITION BY tag ORDER BY dt ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
        ) AS max_retweet_cnt_7d
    FROM daily_stat
)
SELECT *
FROM rolling_stat
-- filter after the window so the first days still see their preceding rows
WHERE dt BETWEEN '2021-10-01' AND '2021-10-03'
ORDER BY tag DESC, dt
8 每篇文章同一时刻最大在看人数
-- Peak number of simultaneous readers per article.
-- Every log row becomes two events: +1 at in_time and -1 at out_time. The running sum of
-- the marks in time order is the number of readers at that moment.
select
    artical_id,
    max(cnt) as max_uv
from
    (
        select
            artical_id,
            -- The running sum must restart for every article; without PARTITION BY it
            -- accumulated readers across all articles.
            -- "mark desc" applies entries before exits that share the same timestamp.
            sum(mark) over (
                partition by artical_id
                order by
                    times,
                    mark desc
            ) as cnt
        from
            (
                select
                    artical_id,
                    in_time as times,
                    1 as mark
                from
                    tb_user_log
                where
                    artical_id <> 0
                union all
                select
                    artical_id,
                    out_time as times,
                    -1 as mark
                from
                    tb_user_log
                where
                    artical_id <> 0
            ) a
    ) b
group by
    artical_id
order by
    max_uv desc
9 每天新用户的次日留存率
-- Next-day retention rate of new users, per first-activity day (from 2021-11-01 on).
select
    dt,
    case
        when count(distinct uid) = 0 then 0
        else round(
            count(distinct case when datediff(next_date, dt) = 1 then uid end)
            / count(distinct uid),
            2
        )
    end as rate
from
    (
        select
            uid,
            dt,
            -- lead/lag need an explicit ORDER BY; without it "next" and "previous" are arbitrary
            lead(dt, 1) over (partition by uid order by dt) as next_date,
            lag(dt, 1) over (partition by uid order by dt) as last_date
        from
            (
                -- one row per user per active day, so a second log on the first day
                -- cannot hide the visit on the following day
                select
                    uid,
                    substr(in_time, 1, 10) as dt
                from
                    tb_user_log
                group by
                    uid,
                    substr(in_time, 1, 10)
            ) d
    ) t1
where
    -- last_date is null marks the user's first active day, i.e. the day the user is new
    last_date is null
    and dt >= '2021-11-01'
group by
    dt
order by
    dt
10 连续签到领金币
解题思路:
1、按用户 id 和(登录日期 - 行号 rn)分组:同一段连续签到内该差值相同,再在组内按日期编号得到连续签到天数
2、
• 当签到天数%7=3 则领取3金币
• 当签到天数%7=0 则领取7金币
• 其余情况,领取1金币
-- Coins earned per user per month from consecutive sign-ins:
-- day 3 of each 7-day cycle pays 3 coins, day 7 pays 7 coins, every other day pays 1.
select
    uid,
    date_format(in_date, '%Y%m') as month,
    sum(
        case
            when mod(days, 7) = 3 then 3
            when mod(days, 7) = 0 then 7
            else 1
        end
    ) as coin
from
    (
        select
            uid,
            in_date,
            -- position of the day inside its streak (1, 2, 3, ...)
            row_number() over (
                partition by uid, streak_start
                order by in_date
            ) as days
        from
            (
                select
                    uid,
                    in_date,
                    -- Real date arithmetic: the former "date(in_date) - rn" subtracted from the
                    -- number YYYYMMDD, which splits a streak that crosses a month boundary.
                    date_sub(in_date, interval rn day) as streak_start
                from
                    (
                        select
                            uid,
                            in_date,
                            row_number() over (
                                partition by uid
                                order by in_date
                            ) as rn
                        from
                            (
                                -- one row per user per sign-in day
                                select
                                    uid,
                                    date(in_time) as in_date
                                from
                                    tb_user_log
                                where
                                    artical_id = 0
                                    and sign_in = 1
                                    and substr(in_time, 1, 10) between '2021-07-07' and '2021-10-31'
                                group by
                                    uid,
                                    date(in_time)
                            ) t1
                    ) t2
            ) t3
    ) t4
group by
    uid,
    date_format(in_date, '%Y%m')
order by
    uid,
    month
11 某乎问答最大连续回答问题天数大于等于3天的用户及其对应等级
-- Authors whose longest run of consecutive answering days is at least 3, with their level.
with answer_day as (
    -- one row per author per answering day
    select
        answer_date,
        author_id
    from answer_tb
    group by
        answer_date,
        author_id
),
ranked_day as (
    select
        answer_date,
        author_id,
        row_number() over (partition by author_id order by answer_date) as rn
    from answer_day
),
streak as (
    -- answer_date minus its row number is constant inside a run of consecutive days.
    -- date_sub replaces "answer_date - 1 * rn", which did numeric YYYYMMDD arithmetic
    -- and produced invalid dates around month boundaries.
    select
        author_id,
        date_sub(answer_date, interval rn day) as diff_date,
        count(1) as days_cnt
    from ranked_day
    group by
        author_id,
        date_sub(answer_date, interval rn day)
),
longest_streak as (
    -- keep one row per author (the longest streak) instead of one row per qualifying streak
    select
        author_id,
        max(days_cnt) as days_cnt
    from streak
    group by author_id
    having max(days_cnt) >= 3
)
select
    t4.author_id,
    t3.author_level,
    t4.days_cnt
from longest_streak t4
left join author_tb t3 on t4.author_id = t3.author_id
order by
    t4.author_id
12 牛客直播开始时各直播间在线人数
-- Number of distinct users online in each course at the 19:00:00 live start.
select
    a.course_id,
    b.course_name,
    count(distinct user_id) as online_num
from attend_tb a
left join course_tb b on a.course_id = b.course_id
-- Compare full times of day. The former '%H:%i' formatting dropped the seconds, so a user
-- leaving at 19:00:30 was excluded and a user joining at 19:00:59 was counted.
where time(in_datetime) <= '19:00:00'
  and time(out_datetime) > '19:00:00'
group by
    a.course_id,
    b.course_name
13 行列转换
描述:表中记录了各年份各部门的平均绩效考核成绩。
表名:t1
表结构:
a – 年份
b – 部门
c – 绩效得分
表内容:
a b c
2014 B 9
2015 A 8
2014 A 10
2015 B 7
13.1 多行转多列
问题描述:将上述表内容转为如下输出结果所示:
a col_A col_B
2014 10 9
2015 8 7
参考答案:
-- Pivot: one output row per year (a), one column per department (b) holding its score (c).
SELECT
    a,
    MAX(CASE b WHEN 'A' THEN c END) col_A,
    MAX(CASE b WHEN 'B' THEN c END) col_B
FROM t1
GROUP BY a;
13.2 将问题一的结果转成源表,问题一结果表名为 t1_2(多列转多行)
参考答案:
-- Unpivot t1_2 back to (a, b, c): one branch per department column, stacked with UNION ALL.
SELECT
    a,
    b,
    c
FROM (
    SELECT a, 'A' AS b, col_a AS c FROM t1_2
    UNION ALL
    SELECT a, 'B' AS b, col_b AS c FROM t1_2
) tmp;
14 student course score 找出哪些学生课程比平均课程成绩高
-- Students whose score in a course is above that course's average score.
-- The FROM keyword was missing in front of the derived table.
select student
from
(
    -- attach the per-course average to every score row
    select student, course, score, avg(score) over (partition by course) as avg_score
    from score
) t1
where score > avg_score
15 一张1-180天注册用户 活跃留存表
现有一个用户活跃表user_active、用户注册表user_regist,表中分区字段都为p_date(yyyy-MM-dd),用户字段均为user_id;设计一张1-180天注册用户 活跃留存表;
表:user_active
字段:user_id,p_date
表:user_regist
字段:user_id,p_date
-- Retention of users registered in the last 180 days: for every registration date and
-- every day offset 1..180, the share of that day's registrants who were active.
-- Both tables only carry user_id and p_date (yyyy-MM-dd), so p_date is used as the
-- registration / activity date; dt, regist_date and active_date are not source columns.
select t.regist_date
    ,t.date_diff
    ,t.user_count / t.regist_count as rate
from (
    select t1.regist_date
        ,max(t1.regist_count) as regist_count
        ,datediff(t2.active_date, t1.regist_date) as date_diff
        ,count(*) as user_count
    from (
        select user_id
            ,p_date as regist_date
            -- registrants per registration date; assumes one row per user in user_regist -- TODO confirm
            ,count(*) over (partition by p_date) as regist_count
        from user_regist
        where p_date >= date_sub(current_date(), 180)
    ) t1
    -- inner join: the day-offset filter below discards unmatched registrants anyway
    inner join (
        -- one row per user per active day
        select user_id
            ,p_date as active_date
        from user_active
        where p_date >= date_sub(current_date(), 180)
        group by user_id, p_date
    ) t2 on t1.user_id = t2.user_id
    where datediff(t2.active_date, t1.regist_date) >= 1
      and datediff(t2.active_date, t1.regist_date) <= 180
    group by t1.regist_date, datediff(t2.active_date, t1.regist_date)
) t
16 两个日期间所有日期
-- List every date from start_date to end_date inclusive.
-- POSEXPLODE returns the index (x) and the value (y) of each array element.
SELECT t2.start_date
    ,t2.end_date
    ,x
    ,y
    ,DATE_ADD(start_date, x) AS date
FROM (
    SELECT start_date
        ,end_date
        ,DATEDIFF(end_date, start_date) AS diff
        -- SPACE(n) is n blanks; replacing each blank with ',1' and splitting on ',' yields
        -- n + 1 elements, i.e. indexes 0..n. The search string was '' before, which matches
        -- nothing useful. DATEDIFF now uses the same two-argument form as the diff column.
        ,SPLIT(REPLACE(SPACE(DATEDIFF(end_date, start_date)), ' ', ',1'), ',') AS day_num
    FROM (
        SELECT '2022-06-01' AS start_date
            ,'2022-06-05' AS end_date
    ) t1
) t2
LATERAL VIEW POSEXPLODE(day_num) t3 AS x, y
17 最大连续登录天数(相邻两次登录间隔不超过 N 天也算连续,下例中 N = 2)
-- Longest login streak per user where two logins at most 2 days apart still count as
-- consecutive. Steps: previous login date -> gap in days -> running count of "breaks"
-- (gap > 2) as an island id -> span of each island -> longest span + 1.
WITH game_user AS (
    SELECT 1001 AS id, '2022-05-01' AS dt
    UNION ALL
    SELECT 1001 AS id, '2022-05-03' AS dt
    UNION ALL
    SELECT 1001 AS id, '2022-05-05' AS dt
    UNION ALL
    SELECT 1001 AS id, '2022-05-06' AS dt
    UNION ALL
    SELECT 1001 AS id, '2022-05-07' AS dt
    UNION ALL
    SELECT 1002 AS id, '2022-05-01' AS dt
    UNION ALL
    SELECT 1002 AS id, '2022-05-04' AS dt
    UNION ALL
    SELECT 1002 AS id, '2022-05-05' AS dt
),
with_prev AS (
    -- the first login of a user has no predecessor and falls back to 1970-01-01
    SELECT
        id,
        dt,
        LAG(dt, 1, '1970-01-01') OVER (PARTITION BY id ORDER BY dt) AS lagdt
    FROM game_user
),
with_gap AS (
    SELECT
        id,
        dt,
        DATEDIFF(dt, lagdt) AS gap_days
    FROM with_prev
),
with_island AS (
    -- every gap of more than 2 days bumps the running counter, which then acts as group id
    SELECT
        id,
        dt,
        SUM(CASE WHEN gap_days > 2 THEN 1 ELSE 0 END) OVER (PARTITION BY id ORDER BY dt) AS flag
    FROM with_gap
),
island_span AS (
    SELECT
        id,
        flag,
        DATEDIFF(MAX(dt), MIN(dt)) AS days
    FROM with_island
    GROUP BY
        id,
        flag
)
SELECT
    id,
    -- +1 because the span is a difference and both end dates belong to the streak
    MAX(days) + 1
FROM island_span
GROUP BY id
18 某个用户连续的访问记录如果时间间隔小于60秒,则视为一个组
-- Assign a group number to each user's visits: consecutive visits less than 60 seconds
-- apart share a group; a gap of 60 seconds or more starts the next group.
select
    id,
    ts,
    -- running count of "new group" markers = group number.
    -- ">= 60" matches the rule "less than 60 seconds stays in the group" (was "> 60").
    sum(if(diff_ts >= 60, 1, 0)) over (partition by id order by ts) as group_id
from
    (select
        id,
        ts,
        ts - lag_ts as diff_ts
    from
        (select
            id,
            ts,
            -- no previous row: fall back to 0 (the trailing comma that broke the query is removed)
            lag(ts, 1, 0) over (partition by id order by ts) as lag_ts
        from tmp.table_test2
        ) t1
    ) t2;
19 打折日期交叉问题
如下为平台商品促销数据:字段为品牌,打折开始日期,打折结束日期,计算每个品牌总的打折销售天数,注意其中的交叉日期,比如vivo品牌,第一次活动时间为2021-06-05到2021-06-15,第二次活动时间为2021-06-09到2021-06-21其中9号到15号为重复天数,只统计一次,即vivo总打折天数为2021-06-05到2021-06-21共计17天。
id | stt | edt |
---|---|---|
oppo | 2021-06-05 | 2021-06-09 |
oppo | 2021-06-11 | 2021-06-21 |
vivo | 2021-06-05 | 2021-06-15 |
vivo | 2021-06-09 | 2021-06-21 |
redmi | 2021-06-05 | 2021-06-21 |
redmi | 2021-06-09 | 2021-06-15 |
redmi | 2021-06-17 | 2021-06-26 |
huawei | 2021-06-05 | 2021-06-26 |
huawei | 2021-06-09 | 2021-06-15 |
huawei | 2021-06-17 | 2021-06-21 |
1 获取当前行以前的数据最大的结束时间放在当前行
2 比较开始时间和下移的数据 如果开始时间大 则无需操作 反之需要移动下来的数据加1后替换当前行的开始时间 第一行数据无需替换
3 计算开始时间与结束时间的差值
4 按照品牌进行分组,将每条记录的有效天数(差值加 1,差值为负的记录不计)求和
-- Total promotion days per brand, counting overlapping date ranges only once.
select
    id,
    -- A negative difference means the range is fully covered by an earlier one and adds
    -- nothing. Otherwise both end dates count, hence +1 (a one-day remainder has days = 0).
    -- Fixed: the IF() wrapper was missing and days = 0 was dropped.
    sum(if(days >= 0, days + 1, 0)) days
from
    (
        select
            id,
            datediff(edt, stt) days
        from (
            select
                id,
                -- first row keeps its start; an overlapping row starts the day after the
                -- latest end date seen so far (fixed typo: data_add -> date_add)
                if(maxEdt is null, stt, if(stt > maxEdt, stt, date_add(maxEdt, 1))) stt,
                edt
            from
                (
                    select
                        id,
                        stt,
                        edt,
                        -- latest end date among the earlier rows of the same brand
                        -- (fixed typos: betwwen -> between, 1PRECEDING -> 1 PRECEDING)
                        max(edt) over (
                            partition by id
                            order by stt
                            rows between unbounded preceding and 1 preceding
                        ) maxEdt
                    from test4
                ) t1
        ) t2
    ) t3
group by id
20 最多连胜的次数
输入:
Matches 表:
±----------±-----------±-------+
| player_id | match_day | result |
±----------±-----------±-------+
| 1 | 2022-01-17 | Win |
| 1 | 2022-01-18 | Win |
| 1 | 2022-01-25 | Win |
| 1 | 2022-01-31 | Draw |
| 1 | 2022-02-08 | Win |
| 2 | 2022-02-06 | Lose |
| 2 | 2022-02-08 | Lose |
| 3 | 2022-03-30 | Win |
±----------±-----------±-------+
输出:
+-----------+----------------+
| player_id | longest_streak |
+-----------+----------------+
| 1         | 3              |
| 2         | 0              |
| 3         | 1              |
+-----------+----------------+
需要注意胜一次算连胜吗?
-- Longest winning streak (consecutive matches won) per player.
-- Row number over all matches minus row number within the same result is constant inside
-- an unbroken run of one result, so it serves as the streak key.
select
    player_id,
    max(cnt) as cnt
from
    (
        select
            player_id,
            dt_num - num2 as streak_key,
            -- non-winning rows contribute 0, so a player without wins gets 0
            sum(if_win) as cnt
        from
            (
                select
                    player_id,
                    -- numeric flag so it can be summed; lower() because the data stores 'Win'
                    if(lower(result) = 'win', 1, 0) as if_win,
                    row_number() over (partition by player_id order by match_day) as dt_num,
                    row_number() over (partition by player_id, result order by match_day) as num2
                from
                    matches
            ) ranked
        group by
            player_id,
            dt_num - num2 -- trailing comma removed
    ) streak
group by
    player_id;
21 求最大连胜天数
-- Longest run of consecutive calendar days with a win, per player.
-- Unlike the match-based streak, dt_num is the day number counted from a fixed date, so a
-- day without a match also breaks the run.
select
    player_id,
    max(cnt) as cnt
from
    (
        select
            player_id,
            dt_num - num2 as streak_key,
            -- non-winning rows contribute 0
            sum(if_win) as cnt
        from
            (
                select
                    player_id,
                    -- numeric flag so it can be summed; lower() because the data stores 'Win'
                    if(lower(result) = 'win', 1, 0) as if_win,
                    -- days since an arbitrary fixed anchor date
                    datediff(match_day, '1997-01-01') as dt_num,
                    row_number() over (partition by player_id, result order by match_day) as num2
                from
                    matches
            ) ranked
        group by
            player_id,
            dt_num - num2 -- trailing comma removed
    ) streak
group by
    player_id;
22 AB球队得分流水表,得到连续三次得分的队员名字和每次赶超对手的球员名字
问题:两支篮球队进行了激烈的篮球比赛,比分交替上升。比赛结束后,你有一张两队得分分数的明细表,记录了球队team,球员号码number,球员姓名name,得分分数score 以及得分时间score_time(datetime)。现在球队要对比赛中表现突出的球员做出嘉奖,所以请你用sql统计出连续三次得分的球员名字,以及每次反超对手时得分的球员名字。
方法1:
-- Players who scored three times in a row within their team's scoring sequence.
-- Reads from bktab, like the other queries for this exercise (`table` is a reserved word).
-- Every run of three has a first row, so checking "this row and the next two share the
-- name" finds the same distinct (name, team) pairs as also checking the middle and last row.
select distinct a.name, a.team
from
(
    select
        name,
        team,
        lead(name, 1) over (partition by team order by score_time) as ld1,
        lead(name, 2) over (partition by team order by score_time) as ld2
    from bktab
) a
where a.name = a.ld1
  and a.name = a.ld2
方法2:
-- Players with at least three consecutive scores, found by grouping:
-- 1. within each team, order the rows by score_time ascending
-- 2. fetch the name of the previous row
-- 3. flag the row: 1 when the name differs from the previous row, 0 when it is the same
-- 4. the running sum of the flag numbers each consecutive run
-- 5. count the rows of each run to get the number of consecutive scores
select
    name
    ,cont_group
    ,count(name) as contin_cnt
    ,team
from (
    select
        team
        ,number
        ,score_time
        ,score
        ,name
        ,pre_name
        ,if_contin
        -- running sum of the change flag = run number inside the team
        ,sum(if_contin) over (partition by team order by score_time asc) as cont_group
    from (
        select
            team
            ,number
            ,score_time
            ,score
            ,name
            -- name of the previous scoring row of the same team
            ,lag(name) over (partition by team order by score_time asc) as pre_name
            -- 0 when the same player scored the previous row, otherwise 1 (also for the first row)
            ,if(lag(name) over (partition by team order by score_time asc) = name
                ,0,1
            ) as if_contin
        from bktab
    ) t1
) t2
-- cont_group is numbered per team, so team must be part of the key; otherwise equal names
-- and run numbers from different teams would be merged
group by team
    ,name
    ,cont_group
-- the required number of consecutive scores is set here
having count(name) >= 3
;
反超对方球队的队员名称
-- Scoring rows on which the lead changed hands: the running score difference (A - B)
-- switches sign compared with the previous scoring row. The very first scoring row also
-- qualifies, because it has no previous difference.
WITH running_score AS (
    -- cumulative points of each team in time order
    SELECT
        team,
        number,
        score_time,
        score,
        name,
        SUM(CASE WHEN team = 'A' THEN score ELSE 0 END) OVER (ORDER BY score_time ASC) AS ateam_score,
        SUM(CASE WHEN team = 'B' THEN score ELSE 0 END) OVER (ORDER BY score_time ASC) AS bteam_score
    FROM bktab
),
score_gap AS (
    SELECT
        team,
        number,
        score_time,
        score,
        name,
        ateam_score,
        bteam_score,
        ateam_score - bteam_score AS diff_score
    FROM running_score
),
gap_with_prev AS (
    SELECT
        team,
        number,
        score_time,
        score,
        name,
        ateam_score,
        bteam_score,
        diff_score,
        LAG(diff_score) OVER (ORDER BY score_time ASC) AS pre_diff_score
    FROM score_gap
)
SELECT
    team,
    number,
    score_time,
    score,
    name,
    ateam_score,
    bteam_score
FROM gap_with_prev
WHERE (diff_score > 0 AND pre_diff_score < 0)                -- A leads now, B led on the previous row
   OR (diff_score < 0 AND pre_diff_score > 0)                -- B leads now, A led on the previous row
   OR (diff_score IS NOT NULL AND pre_diff_score IS NULL)    -- first scoring row
23 SQL实现次日、三日及七日用户留存率的计算
23.1、方案一
注意role_login_back 表去重
-- Day-1 / day-3 / day-7 retention per first-login date, using one self-join per offset.
select
    log_day '日期',
    count(user_id_d0) '新增数量',
    count(user_id_d1) / count(user_id_d0) '次日留存率',
    count(user_id_d3) / count(user_id_d0) '3日留存率',
    count(user_id_d7) / count(user_id_d0) '7日留存率' -- trailing comma removed
from (
    -- DISTINCT collapses the duplicates created by several login rows on the same day
    select
        distinct a.log_day,
        a.user_id_d0,
        b.device_id as user_id_d1,
        c.device_id as user_id_d3,
        d.device_id as user_id_d7
    from
        -- first login date per device; only the date matters, not the time of day.
        -- min() makes the date well defined under GROUP BY device_id.
        (select
            min(date(event_time)) as log_day,
            device_id as user_id_d0
        from role_login_back
        group by device_id) a
    left join role_login_back b
        on datediff(date(b.event_time), a.log_day) = 1
        and a.user_id_d0 = b.device_id
    left join role_login_back c
        on datediff(date(c.event_time), a.log_day) = 2
        and a.user_id_d0 = c.device_id
    left join role_login_back d
        on datediff(date(d.event_time), a.log_day) = 6
        and a.user_id_d0 = d.device_id
) retained -- a derived table needs an alias in MySQL
group by log_day;
23.2、方案二
-- Retention of the users active on 2022-05-01, based on each user's next distinct active dates.
-- The subquery exposes uid, so the counts use uid (user_id does not exist there).
SELECT
    -- active again exactly 2 days later: that date can only be the 1st or 2nd next active date
    COUNT(CASE WHEN DATEDIFF(next1_date, date) = 2
                 OR DATEDIFF(next2_date, date) = 2 THEN uid END) / COUNT(uid) AS retention_d2
    -- active again exactly 4 days later: that date is among the next 4 active dates
    ,COUNT(CASE WHEN DATEDIFF(next1_date, date) = 4
                  OR DATEDIFF(next2_date, date) = 4
                  OR DATEDIFF(next3_date, date) = 4
                  OR DATEDIFF(next4_date, date) = 4 THEN uid END) / COUNT(uid) AS retention_d4
FROM (
    SELECT uid
        ,date
        ,LEAD(date, 1) OVER (PARTITION BY uid ORDER BY date) AS next1_date
        ,LEAD(date, 2) OVER (PARTITION BY uid ORDER BY date) AS next2_date
        ,LEAD(date, 3) OVER (PARTITION BY uid ORDER BY date) AS next3_date
        ,LEAD(date, 4) OVER (PARTITION BY uid ORDER BY date) AS next4_date
    FROM (
        -- one row per user per active date
        SELECT uid
            ,date
        FROM t1
        GROUP BY uid
            ,date
    ) a
) b
WHERE date = '2022-05-01'