1、
–dws层经过处理后的数据
select
table1.user_id as user_id ,
table1.read_book_id as arrays
--arrays字段在dws_qy_profile_reads_stat ,中该字段为 array[]类型的数据
from
dws_reads table1 ;
分析历史数据中的具体某一条数据
2、
–dwd_qy_profile_cpsshardn_user_recently_read表提供增量数据数据来源
–统计每天的数据按照user_id字段进行聚合收集 book_id字段,
–收集后的数据集放在arrays字段中
select
table2.user_id as user_id,
collect_set(table2.book_id) as arrays
from
dwd_recently_read table2
where
year = '2022' --${year}
and
month = '03' --${month}
and
day = '02' --${day}
group by
table2.user_id ;
增量数据中的具体某一条
3、
–统计用户阅读过的书籍
–让历史数据跟增量数据进行整合
–先将两个数组字段中的集合值进行炸开(与视图配合使用)
select
temp.user_id as user_id,
adid
from (
select
table1.user_id as user_id ,
table1.read_book_id as arrays
from
dws_reads table1
union all
select
table2.user_id as user_id,
collect_set(table2.book_id) as arrays
from
dwd_recently_read table2
where
year = '2022' --${year}
and
month = '03' --${month}
and
day = '02' --${day}
group by
table2.user_id
) temp LATERAL VIEW explode(temp.arrays) adTable AS adid
--adTable 视图别名 ,adid 新列名
炸开后的结果:
"10072771" "10072771" "10072771" "10072771"
4、
–然后在按照用户进行聚合
select
tt.user_id,
--tt.adid,
collect_set(tt.adid)
from
(
select
temp.user_id as user_id,
adid
from (
select
table1.user_id as user_id ,
table1.read_book_id as arrays
from
dws_reads table1
union all
select
table2.user_id as user_id,
collect_set(table2.book_id) as arrays
from
dwd_recently_read table2
where
year = '2022' --${year}
and
month = '03' --${month}
and
day = '02' --${day}
group by
table2.user_id
) temp LATERAL VIEW explode(temp.arrays) adTable AS adid
--adTable 视图别名 ,adid 新列名
)tt
group by user_id; --24条数据
聚合去重后的结果:(这里是由于巧合,历史数据跟新增数据完全一样,如果新增数据跟历史数据不一样,聚合去重后的数据,数组中就会新增元素)
版权声明:本文为weixin_46609492原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。