文章目录
- 小练习一
- 练习二:连续时间区间合并
- 练习三:间隔连续问题
- 练习四: 打折日期交叉问题
小练习一
- tableA 存储了一所大学所有人的所有科目的考试成绩,共4列:学号 ( stu_num ),考试科目(subject),考试成绩( score )等级 ( level )(共SABCD五个值),考试时间 ( time ),
- 求:
第一次考试的考试成绩等级为A,其它考试成绩等级都为B的学号。
select stu_num
from (select stu_num,part_cnt,sum(case when t.rank=1 and t.level='A' then 1 else flag_b end ) as flag_bfrom (selectstu_num,level,case when level='B' then 1 else 0 as flag_b,sum(stu_num) over (partition by stu_num) as part_cnt,row_num()over(partition by stu_num order by time ) as rankfrom tableA)tgroup by stu_num,part_cnt)t2
where t2.flag_b=t2.part_cnt
;
练习二:连续时间区间合并
- 需求:连续时间区间合并
- user_id 用户id
- location_id 地点
- start_time 停留开始时间(时间)
- stop_mint 停留时间(分钟)
select *
from tmpdb.user_location
;
- 思路
- step1:打标签is_first,该标签为向后开始合并的第一条
- step2:开窗聚合标签,聚合出来group_i,即这些数据是需要合并的一组
- step3:取到各组的最小开始时间,及最大结束时间,即为合并区间的起始和结束
- 答案解析
select user_id,location_id,start_time,(unix_timestamp(end_time,'yyyy-MM-dd HH:mm:ss') - unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss'))/60 as stop_mint
from
(select user_id,location_id,grp,min(start_time) as start_time,max(end_time) as end_timefrom(select user_id,location_id,start_time,end_time,sum(is_first)over(partition by user_id,location_id order by start_time) as grpfrom(select user_id,location_id,start_time,end_time,case when start_time > lag_end_time then 1 else 0 end is_firstfrom(select user_id,location_id,start_time,end_time,lag(end_time,1,0)over(partition by user_id,location_id order by start_time) as lag_end_time ,lead(start_time,1,0)over(partition by user_id,location_id order by start_time) as lead_start_time from(select user_id,location_id,from_unixtime(unix_timestamp(start_time,'yyyy-MM-ddHH:mm:ss') ,'yyyy-MM-dd HH:mm:ss') as start_time ,from_unixtime( unix_timestamp(start_time,'yyyy-MM-ddHH:mm:ss') + stop_mint*60 ,'yyyy-MM-dd HH:mm:ss') as end_time from tmpdb.user_location) time_fix ) time_fix_lag_lead) pre_grp_flag) pre_grpgroup by user_id,location_id,grp
) grp
;
- 详细拆分(看懂上面的下面的请忽略,这里为了方便理解,分的步骤贼散)
with time_fix as
(
select user_id ,location_id,from_unixtime(unix_timestamp(start_time,'yyyy-MM-ddHH:mm:ss') ,'yyyy-MM-dd HH:mm:ss') as start_time ,from_unixtime( unix_timestamp(start_time,'yyyy-MM-ddHH:mm:ss') + stop_mint*60 , 'yyyy-MM-dd HH:mm:ss') as end_time
from tmpdb.user_location
)
, time_fix_lag_lead as
(
select user_id ,location_id,start_time,end_time,lag(end_time,1,0)over(partition by user_id,location_id order by start_time) as lag_end_time ,lead(start_time,1,0)over(partition by user_id,location_id order by start_time) as lead_start_time
from time_fix
)
,pre_grp_flag as
(
select user_id ,location_id ,start_time,end_time,case when start_time > lag_end_time then 1 else 0 end is_first
from time_fix_lag_lead
)
,pre_grp as
(
select user_id ,location_id ,start_time,end_time,sum(is_first)over(partition by user_id,location_id order by start_time) as grp
from pre_grp_flag
)
,grp as
(
select user_id,location_id,grp,min(start_time) as start_time,max(end_time) as end_time
from pre_grp
group by user_id,location_id,grp
)
select user_id ,location_id ,start_time,(unix_timestamp(end_time,'yyyy-MM-dd HH:mm:ss') - unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss'))/60 as stop_mint
from grp
;
练习三:间隔连续问题
- 需求:计算每个用户最大的连续登录天数,可以间隔一天。解释:如果一个用户在 1,3,5,6 登录游戏,则视为连续 6 天登录。
- 结果 1001 5天 1002 2天
- 准备数据
create table tmpdb.hanjiaxiaozhi as select 1001 as id , '2021-12-12' as dt
union all select 1002 as id , '2021-12-12' as dt
union all select 1001 as id , '2021-12-13' as dt
union all select 1001 as id , '2021-12-14' as dt
union all select 1001 as id , '2021-12-16' as dt
union all select 1002 as id , '2021-12-16' as dt
union all select 1001 as id , '2021-12-19' as dt
union all select 1002 as id , '2021-12-17' as dt
union all select 1001 as id , '2021-12-20' as dt
;
select *
from tmpdb.hanjiaxiaozhi
;
select id,max(max_login) as max_login
from
(select id,grp_id,datediff(max(dt),min(dt))+1 as max_loginfrom(select id,dt,lag_dt,sum(if(datediff(dt,lag_dt) > 2,1,0)) over (partition by id order by dt) as grp_idfrom(select id,dt,lag(dt,1,'1970-01-01') over(partition by id order by dt) as lag_dtfrom tmpdb.hanjiaxiaozhi) pre )pre_3 group by id,grp_id
)t
group by id
;
- 详细拆分(看懂上面的下面的请忽略,这里为了方便理解,分的步骤贼散)
with pre as
(selectid,dt,lag(dt,1,'1970-01-01') over(partition by id order by dt) as lag_dt
from tmpdb.hanjiaxiaozhi
)
,pre_2 as
(
select id ,dt,lag_dt,if(datediff(dt,lag_dt)<=2,0,1) as is_first
from pre
)
,pre_3 as
(
select id,dt,lag_dt,is_first,sum(is_first) over (partition by id order by dt) as grp_id
from pre_2
)
selectid,max(max_login) as max_login
from(selectid,grp_id,datediff(max(dt),min(dt))+1 as max_loginfrom pre_3group by id,grp_id)t
group by id
;
练习四: 打折日期交叉问题
- 需求
- 如下为平台商品促销数据:字段为品牌,打折开始日期,打折结束日期
- 计算每个品牌总的打折销售天数,注意其中的交叉日期,比如 vivo 品牌,第一次活动时间为 2021-06-05 到 2021-06-15,第二次活动时间为 2021-06-09 到 2021-06-21 其中 9 号到 15号为重复天数,只统计一次,即 vivo 总打折天数为 2021-06-05 到 2021-06-21 共计 17 天。
- 准备数据
create table py_tmpdb.hanjiaxiaozhi asselect 'oppo' as brand, '2021-06-05' as stt , '2021-06-09' as edt
union all select 'oppo' as brand, '2021-06-11' as stt , '2021-06-21' as edt
union all select 'vivo' as brand, '2021-06-05' as stt , '2021-06-15' as edt
union all select 'vivo' as brand, '2021-06-09' as stt , '2021-06-21' as edt
union all select 'redmi' as brand, '2021-06-05' as stt , '2021-06-21' as edt
union all select 'redmi' as brand, '2021-06-09' as stt , '2021-06-15' as edt
union all select 'redmi' as brand, '2021-06-17' as stt , '2021-06-26' as edt
union all select 'huawei' as brand, '2021-06-05' as stt , '2021-06-26' as edt
union all select 'huawei' as brand, '2021-06-09' as stt , '2021-06-15' as edt
union all select 'huawei' as brand, '2021-06-17' as stt , '2021-06-21' as edt
;
select *
from py_tmpdb.hanjiaxiaozhi
brand dur_days
huawei 22
oppo 16
redmi 22
vivo 17
- 思路
- step1:取到本条之前 最大的结束时间
- step2:如果stt大于之前最大的edt,不变,否则 把stt+1置为上一条的结束时间
- step3:datediff(edt,stt),会出现负的,这些数据下一步去掉
- step4:如果dur_days为负数,那么+0,否则加dur_days+1
- 答案解析
select brand,sum(if(dur_days > 0,dur_days+1,0)) as dur_days
from
(select brand,stt,edt,datediff(edt,stt) as dur_daysfrom(select brand,if(max_edt is null,stt,if(stt > max_edt,stt,date_add(max_edt,1))) as stt,edtfrom(select brand,stt,edt,max(edt)over(partition by brand order by stt rows between unbounded preceding and 1 preceding) as max_edtfrom py_tmpdb.hanjiaxiaozhi) step_1 ) step_2
) step_3
group by brand
;
- 详细拆分(看懂上面的下面的请忽略,这里为了方便理解,分的步骤贼散)
with step_1 as
(
selectbrand,stt,edt,max(edt)over(partition by brand order by stt rows between unbounded preceding and 1 preceding) as max_edt
from py_tmpdb.hanjiaxiaozhi
)
,step_2 as
(
select brand,if(max_edt is null,stt,if(stt>max_edt,stt,date_add(max_edt,1))) as stt,edt
from step_1
)
,step_3 as
(
selectbrand,stt,edt,datediff(edt,stt) as dur_days
from step_2
)
selectbrand,sum(if(dur_days>0,dur_days+1,0)) as dur_days
from step_3
group by brand
;