离线Hive数据仓库学习 dm_attendee

  • Post author:
  • Post category:其他


set hive.auto.convert.join=false

create database dm_attendee;

use dm_attendee;

----------temp start-----------
select
       t.userid,
       t.eventid,
       t.interested,
       t.invited,
       t.`timestamp`,
       e.user_id as event_creator,
       e.city as event_city,
       e.country as event_country,
       e.state as event_state,
       e.lat,
       e.lng
from
     dwd_events.train t inner join dwd_events.events e on t.eventid=e.eventid;
-----------temp end------------------
--from_unixtime(unix_timestamp(concat( substr(e.start_time,1,10) )  ,"yyyy-MM-dd hh:mm:ss")
select
    t.userid,e.start_time,t.`timestamp`,
    case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
        then concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8)) else null end start_time_str,
    case when t.`timestamp` regexp '\\d{4}-\\d{2}-\\d{2}\\s\\d{2}:\\d{2}:\\d{2}.*'
        then substr(t.`timestamp`, 1,19) else null end timestamestr
from
    dwd_events.train t inner join dwd_events.events e on t.eventid=e.eventid;

--                        "2012-12-14 03:00:00"

select
    t.userid,e.start_time,t.`timestamp`,
    -- train的用户知道eventid的时间和 eventid 举办的时间 差值  例  2022-1-13知道范范 2025-10-01结婚  中间相差多少天
    case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
                  and
              t.`timestamp` regexp '\\d{4}-\\d{2}-\\d{2}\\s\\d{2}:\\d{2}:\\d{2}.*'
        then datediff(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8)),substr(t.`timestamp`, 1,19))
        else null end invite_ahead_days,
    -- 事件举办时在什么月份
    case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
        then month(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8)))
            else null end event_start_month,
    -- 事件举办时是周几
    case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
        then from_unixtime(unix_timestamp(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8))),'u')
        else null end event_start_dayofweek,
    -- 事件举办时是几点
    case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
        then hour(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8)))
        else null end event_start_hour,
    -- 当前统计时的时间到事件举办时相差多少天
    case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
        then datediff(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8)),from_unixtime(unix_timestamp()))
        else null end start_ahead_days
from
    dwd_events.train t inner join dwd_events.events e on t.eventid=e.eventid;


drop table if exists user_event;
create table user_event stored as orc as
select
       t.userid,
       t.eventid,
       t.interested,
       t.invited,
       e.user_id as event_creator,
       e.city as event_city,
       e.country as event_country,
       e.state as event_state,
       e.lat,
       e.lng,
       -- train的用户知道eventid的时间和 eventid 举办的时间 差值  例  2022-1-13知道范范 2025-10-01结婚  中间相差多少天
    case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
                  and
              t.`timestamp` regexp '\\d{4}-\\d{2}-\\d{2}\\s\\d{2}:\\d{2}:\\d{2}.*'
        then datediff(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8)),substr(t.`timestamp`, 1,19))
        else null end invite_ahead_days,
    -- 事件举办时在什么月份
    case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
        then month(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8)))
            else null end event_start_month,
    -- 事件举办时是周几
    case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
        then from_unixtime(unix_timestamp(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8))),'u')
        else null end event_start_dayofweek,
    -- 事件举办时是几点
    case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
        then hour(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8)))
        else null end event_start_hour,
    -- 当前统计时的时间到事件举办时相差多少天
    case when e.start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}Z'
        then datediff(concat(substr(e.start_time,1,10),' ',substr(e.start_time,12,8)),from_unixtime(unix_timestamp()))
        else null end start_ahead_days
from
    dwd_events.train t left join dwd_events.events e on t.eventid=e.eventid;

select * from dwd_events.train where userid='user' limit 10;

select count(*) from user_event;

--  user_event表 与 user_friend表关联,找出event创建者是不是userid(train)的朋友
-- user_friend_count  关联条件 user_event.userid=user_friend_count.userid
--   gree    xiaofan     25/30=83.3

use dm_attendee;

drop table if exists friend_attend_percentage;
create table friend_attend_percentage stored as orc as
with temp1 as (
select
       ue.userid,
       ue.eventid,
       ue.interested,
       ue.invited,
       ue.event_creator,
       ue.event_city,
       ue.event_country,
       ue.event_state,
       ue.lat,
       ue.lng,
       ue.invite_ahead_days,
       ue.event_start_month,
       ue.event_start_dayofweek,
       ue.event_start_hour,
       ue.start_ahead_days,
       case when ufc.friendcount is not null then ufc.friendcount else 0 end as user_friend_count,
       case when uf.friendid is not null then 1 else 0 end as event_creator_is_friend,
       case when fas.invited_friends_count is not null then fas.invited_friends_count else 0 end invited_friends_count,
       case when fas.maybe_friends_count is not null then fas.maybe_friends_count else 0 end maybe_friends_count,
       case when fas.yes_friends_count is not null then fas.yes_friends_count else 0 end yes_friends_count,
       case when fas.no_friends_count is not null then fas.no_friends_count else 0 end no_friends_count
from dm_attendee.user_event ue left join dws_events.user_friend_count ufc on ue.userid=ufc.userid
left join dwd_events.user_friend uf on ue.userid=uf.userid and ue.event_creator=uf.friendid
left join dws_events.friend_attend_summary fas on fas.userid=ue.userid and ue.eventid=fas.eventid
     )
select
       temp1.userid,
       temp1.eventid,
       temp1.interested,
       temp1.invited,
       temp1.event_creator,
       temp1.event_city,
       temp1.event_country,
       temp1.event_state,
       temp1.lat,
       temp1.lng,
       temp1.invite_ahead_days,
       temp1.event_start_month,
       temp1.event_start_dayofweek,
       temp1.event_start_hour,
       temp1.start_ahead_days,
       temp1.user_friend_count,
       temp1.event_creator_is_friend,
       temp1.invited_friends_count,
       temp1.maybe_friends_count,
       temp1.yes_friends_count,
       temp1.no_friends_count,
      case when temp1.user_friend_count!=0 then temp1.invited_friends_count*100/temp1.user_friend_count else 0 end as invited_friends_percentage,
       case when temp1.user_friend_count!=0 then temp1.yes_friends_count*100/temp1.user_friend_count else 0 end as yes_friends_percentage,
       case when temp1.user_friend_count!=0 then temp1.no_friends_count*100/temp1.user_friend_count else 0 end as no_friends_percentage,
       case when temp1.user_friend_count!=0 then temp1.maybe_friends_count*100/temp1.user_friend_count else 0 end as maybe_friends_percentage
from temp1;

select count(*) from friend_attend_percentage;

--搭建一台虚拟-->.csv->flume-->kafka->kafkastream/sparkstream/flink-->hbase/mongodb(interface)-->hive(数仓库)-->特征值
-->finereport-->自动化部署   ?--》用户是否对某一件事情感兴趣预测

create temporary  macro locationSmililar(
    user_location string,
    event_city string,
    event_country string,
    event_state string
    )
case
    when instr(user_location,event_city) >0 or instr(user_location, event_country)>0
    or instr(user_location, event_state)>0 then 1 else 0
    end;


drop table if exists user_friend_event;
create table user_friend_event stored as orc as
select
   fap.userid,
   fap.eventid,
   cast(fap.interested as int) user_interested,
   cast(fap.invited as int) user_invited,
 --  fap.event_creator,
   case when ec.level is not null then ec.level else 0 end as event_city_level, --fap.event_city,
   case when ect.level is not null then ect.level else 0 end as event_country_level, --fap.event_country,
-- fap.event_state,
   case when fap.lat is not null then fap.lat else 0 end as event_lat,
   case when fap.lng is not null then fap.lng else 0 end as event_lng,
   fap.invite_ahead_days,
   fap.event_start_month,
   cast(fap.event_start_dayofweek as int) event_start_dayofweek,
   fap.event_start_hour,
   fap.start_ahead_days,
   fap.user_friend_count,
   fap.event_creator_is_friend,
   fap.invited_friends_count,
   fap.maybe_friends_count,
   fap.yes_friends_count,
   fap.no_friends_count,
   fap.invited_friends_percentage,
   fap.yes_friends_percentage,
   fap.no_friends_percentage,
   fap.maybe_friends_percentage,
    year(from_unixtime(unix_timestamp())) - u.birthyear as user_age,
    case when u.gender='male' then 1 when u.gender='female' then 0 else -1 end user_gender,
    datediff(from_unixtime(unix_timestamp()), concat(substr(u.joinedat,1,10))) user_member_days,
    locationSmililar(lower(u.location),fap.event_city,fap.event_country, fap.event_state) location_similar,
    case when uec.event_count         is not null then uec.event_count else 0 end as user_had_event_count,
    case when uaec.invited_count      is not null then uaec.invited_count else 0 end as user_invited_event_count,
    case when uaec.maybe_attend_count is not null then uaec.maybe_attend_count else 0 end as user_maybe_attend_event_count,
    case when uaec.no_attend_count    is not null then uaec.no_attend_count else 0 end as user_no_attend_event_count,
    case when uaec.yes_attend_count   is not null then uaec.yes_attend_count else 0 end as user_yes_attend_event_count,
    case when l.local_id is not null then l.local_id else 0 end user_locale,
    case when tz.time_zone_id is not null then tz.time_zone_id else 0 end user_time_zone,
    case when inv.attendcount is not null then inv.attendcount else 0 end event_invited_user_count,
    case when yes.attendcount is not null then yes.attendcount else 0 end event_yes_user_count,
    case when no.attendcount is not null then no.attendcount else 0 end event_no_user_count,
    case when maybe.attendcount is not null then maybe.attendcount else 0 end event_maybe_user_count
from friend_attend_percentage fap
    inner join dwd_events.users u on fap.userid=u.userid
    left join dws_events.event_cities ec on fap.event_city=ec.city
    left join dws_events.event_countries ect on fap.event_country=ect.country
    left join dws_events.user_event_count uec on uec.userid=fap.userid
    left join dws_events.user_attend_event_count uaec on uaec.userid=fap.userid
    left join dwd_events.`local` l on l.locale=u.locale
    left join dwd_events.time_zone tz on tz.time_zone=u.timezone
    left join dws_events.event_attendee_count inv on inv.eventid=fap.eventid and inv.state='invited'
     left join dws_events.event_attendee_count yes on yes.eventid=fap.eventid and yes.state='yes'
    left join dws_events.event_attendee_count no on no.eventid=fap.eventid and no.state='no'
    left join dws_events.event_attendee_count maybe on maybe.eventid=fap.eventid and maybe.state='maybe'

;

select count(*) from user_friend_event;

use dm_attendee;


select * from user_friend_event;

select count(*) from user_friend_event;

desc user_friend_event;



版权声明:本文为liuyongsheng666原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。