set hive.auto.convert.join=false
create database dm_attendee;
use dm_attendee;
----------temp start-----------
select
t.userid,
t.eventid,
t.interested,
t.invited,
t.`timestamp`,
e.user_id as event_creator,
e.city as event_city,
e.country as event_country,
e.state as event_state,
e.lat,
e.lng
from
dwd_events.train t inner join dwd_events.events e on t.eventid=e.eventid;
-----------temp end------------------
--from_unixtime(unix_timestamp(concat( substr(e.start_time,1,10) ) ,"yyyy-MM-dd hh:mm:ss")
select
t.userid,e.start_time,t.`timestamp`,
case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
then concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8)) else null end start_time_str,
case when t.`timestamp` regexp '\\d{4}-\\d{2}-\\d{2}\\s\\d{2}:\\d{2}:\\d{2}.*'
then substr(t.`timestamp`, 1,19) else null end timestamestr
from
dwd_events.train t inner join dwd_events.events e on t.eventid=e.eventid;
-- "2012-12-14 03:00:00"
select
t.userid,e.start_time,t.`timestamp`,
-- train的用户知道eventid的时间和 eventid 举办的时间 差值 例 2022-1-13知道范范 2025-10-01结婚 中间相差多少天
case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
and
t.`timestamp` regexp '\\d{4}-\\d{2}-\\d{2}\\s\\d{2}:\\d{2}:\\d{2}.*'
then datediff(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8)),substr(t.`timestamp`, 1,19))
else null end invite_ahead_days,
-- 事件举办时在什么月份
case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
then month(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8)))
else null end event_start_month,
-- 事件举办时是周几
case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
then from_unixtime(unix_timestamp(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8))),'u')
else null end event_start_dayofweek,
-- 事件举办时是几点
case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
then hour(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8)))
else null end event_start_hour,
-- 当前统计时的时间到事件举办时相差多少天
case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
then datediff(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8)),from_unixtime(unix_timestamp()))
else null end start_ahead_days
from
dwd_events.train t inner join dwd_events.events e on t.eventid=e.eventid;
drop table if exists user_event;
create table user_event stored as orc as
select
t.userid,
t.eventid,
t.interested,
t.invited,
e.user_id as event_creator,
e.city as event_city,
e.country as event_country,
e.state as event_state,
e.lat,
e.lng,
-- train的用户知道eventid的时间和 eventid 举办的时间 差值 例 2022-1-13知道范范 2025-10-01结婚 中间相差多少天
case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
and
t.`timestamp` regexp '\\d{4}-\\d{2}-\\d{2}\\s\\d{2}:\\d{2}:\\d{2}.*'
then datediff(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8)),substr(t.`timestamp`, 1,19))
else null end invite_ahead_days,
-- 事件举办时在什么月份
case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
then month(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8)))
else null end event_start_month,
-- 事件举办时是周几
case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
then from_unixtime(unix_timestamp(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8))),'u')
else null end event_start_dayofweek,
-- 事件举办时是几点
case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
then hour(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8)))
else null end event_start_hour,
-- 当前统计时的时间到事件举办时相差多少天
case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
then datediff(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8)),from_unixtime(unix_timestamp()))
else null end start_ahead_days
from
dwd_events.train t left join dwd_events.events e on t.eventid=e.eventid;
select * from dwd_events.train where userid='user' limit 10;
select count(*) from user_event;
-- user_event表 与 user_friend表关联,找出event创建者是不是userid(train)的朋友
-- user_friend_count 关联条件 user_event.userid=user_friend_count.userid
-- gree xiaofan 25/30=83.3
use dm_attendee;
drop table if exists friend_attend_percentage;
create table friend_attend_percentage stored as orc as
with temp1 as (
select
ue.userid,
ue.eventid,
ue.interested,
ue.invited,
ue.event_creator,
ue.event_city,
ue.event_country,
ue.event_state,
ue.lat,
ue.lng,
ue.invite_ahead_days,
ue.event_start_month,
ue.event_start_dayofweek,
ue.event_start_hour,
ue.start_ahead_days,
case when ufc.friendcount is not null then ufc.friendcount else 0 end as user_friend_count,
case when uf.friendid is not null then 1 else 0 end as event_creator_is_friend,
case when fas.invited_friends_count is not null then fas.invited_friends_count else 0 end invited_friends_count,
case when fas.maybe_friends_count is not null then fas.maybe_friends_count else 0 end maybe_friends_count,
case when fas.yes_friends_count is not null then fas.yes_friends_count else 0 end yes_friends_count,
case when fas.no_friends_count is not null then fas.no_friends_count else 0 end no_friends_count
from dm_attendee.user_event ue left join dws_events.user_friend_count ufc on ue.userid=ufc.userid
left join dwd_events.user_friend uf on ue.userid=uf.userid and ue.event_creator=uf.friendid
left join dws_events.friend_attend_summary fas on fas.userid=ue.userid and ue.eventid=fas.eventid
)
select
temp1.userid,
temp1.eventid,
temp1.interested,
temp1.invited,
temp1.event_creator,
temp1.event_city,
temp1.event_country,
temp1.event_state,
temp1.lat,
temp1.lng,
temp1.invite_ahead_days,
temp1.event_start_month,
temp1.event_start_dayofweek,
temp1.event_start_hour,
temp1.start_ahead_days,
temp1.user_friend_count,
temp1.event_creator_is_friend,
temp1.invited_friends_count,
temp1.maybe_friends_count,
temp1.yes_friends_count,
temp1.no_friends_count,
case when temp1.user_friend_count!=0 then temp1.invited_friends_count*100/temp1.user_friend_count else 0 end as invited_friends_percentage,
case when temp1.user_friend_count!=0 then temp1.yes_friends_count*100/temp1.user_friend_count else 0 end as yes_friends_percentage,
case when temp1.user_friend_count!=0 then temp1.no_friends_count*100/temp1.user_friend_count else 0 end as no_friends_percentage,
case when temp1.user_friend_count!=0 then temp1.maybe_friends_count*100/temp1.user_friend_count else 0 end as maybe_friends_percentage
from temp1;
select count(*) from friend_attend_percentage;
--搭建一台虚拟-->.csv->flume-->kafka->kafkastream/sparkstream/flink-->hbase/mongodb(interface)-->hive(数仓库)-->特征值
-->finereport-->自动化部署 ?--》用户是否对某一件事情感兴趣预测
create temporary macro locationSmililar(
user_location string,
event_city string,
event_country string,
event_state string
)
case
when instr(user_location,event_city) >0 or instr(user_location, event_country)>0
or instr(user_location, event_state)>0 then 1 else 0
end;
drop table if exists user_friend_event;
create table user_friend_event stored as orc as
select
fap.userid,
fap.eventid,
cast(fap.interested as int) user_interested,
cast(fap.invited as int) user_invited,
-- fap.event_creator,
case when ec.level is not null then ec.level else 0 end as event_city_level, --fap.event_city,
case when ect.level is not null then ect.level else 0 end as event_country_level, --fap.event_country,
-- fap.event_state,
case when fap.lat is not null then fap.lat else 0 end as event_lat,
case when fap.lng is not null then fap.lng else 0 end as event_lng,
fap.invite_ahead_days,
fap.event_start_month,
cast(fap.event_start_dayofweek as int) event_start_dayofweek,
fap.event_start_hour,
fap.start_ahead_days,
fap.user_friend_count,
fap.event_creator_is_friend,
fap.invited_friends_count,
fap.maybe_friends_count,
fap.yes_friends_count,
fap.no_friends_count,
fap.invited_friends_percentage,
fap.yes_friends_percentage,
fap.no_friends_percentage,
fap.maybe_friends_percentage,
year(from_unixtime(unix_timestamp())) - u.birthyear as user_age,
case when u.gender='male' then 1 when u.gender='female' then 0 else -1 end user_gender,
datediff(from_unixtime(unix_timestamp()), concat(substr(u.joinedat,1,10))) user_member_days,
locationSmililar(lower(u.location),fap.event_city,fap.event_country, fap.event_state) location_similar,
case when uec.event_count is not null then uec.event_count else 0 end as user_had_event_count,
case when uaec.invited_count is not null then uaec.invited_count else 0 end as user_invited_event_count,
case when uaec.maybe_attend_count is not null then uaec.maybe_attend_count else 0 end as user_maybe_attend_event_count,
case when uaec.no_attend_count is not null then uaec.no_attend_count else 0 end as user_no_attend_event_count,
case when uaec.yes_attend_count is not null then uaec.yes_attend_count else 0 end as user_yes_attend_event_count,
case when l.local_id is not null then l.local_id else 0 end user_locale,
case when tz.time_zone_id is not null then tz.time_zone_id else 0 end user_time_zone,
case when inv.attendcount is not null then inv.attendcount else 0 end event_invited_user_count,
case when yes.attendcount is not null then yes.attendcount else 0 end event_yes_user_count,
case when no.attendcount is not null then no.attendcount else 0 end event_no_user_count,
case when maybe.attendcount is not null then maybe.attendcount else 0 end event_maybe_user_count
from friend_attend_percentage fap
inner join dwd_events.users u on fap.userid=u.userid
left join dws_events.event_cities ec on fap.event_city=ec.city
left join dws_events.event_countries ect on fap.event_country=ect.country
left join dws_events.user_event_count uec on uec.userid=fap.userid
left join dws_events.user_attend_event_count uaec on uaec.userid=fap.userid
left join dwd_events.`local` l on l.locale=u.locale
left join dwd_events.time_zone tz on tz.time_zone=u.timezone
left join dws_events.event_attendee_count inv on inv.eventid=fap.eventid and inv.state='invited'
left join dws_events.event_attendee_count yes on yes.eventid=fap.eventid and yes.state='yes'
left join dws_events.event_attendee_count no on no.eventid=fap.eventid and no.state='no'
left join dws_events.event_attendee_count maybe on maybe.eventid=fap.eventid and maybe.state='maybe'
;
select count(*) from user_friend_event;
use dm_attendee;
select * from user_friend_event;
select count(*) from user_friend_event;
desc user_friend_event;
版权声明:本文为liuyongsheng666原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。