文章目录
Hive实战之谷粒影音
1. 项目数据结构
1.1 视频表
字段 | 备注 | 详细描述 |
---|---|---|
video_id | 视频唯一 id | 11 位字符串 |
uploader | 视频上传者 | 上传视频的用户名 String |
age | 视频年龄 | 视频在平台上的整数天 |
category | 视频类别 | 上传视频指定的视频分类 |
length | 视频长度 | 整形数字标识的视频长度 |
views | 观看次数 | 视频被浏览的次数 |
rate | 视频评分 | 满分 5 分 |
ratings | 流量 | 视频的流量,整型数字 |
conments | 评论数 | 一个视频的整数评论数 |
related_ids | 相关视频 id | 相关视频的 id,最多 20 个 |
1.2 用户表
字段 | 备注 | 字段类型 |
---|---|---|
uploader | 上传者用户名 | string |
videos | 上传视频数 | int |
friends | 朋友数量 | int |
2. 数据清洗
- 通过观察
原始数据形式
,可以发现,视频可以有多个所属分类,每个所属分类用&符号分割,且分割的两边有空格字符,同时相关视频也是可以有多个元素,多个相关视频又用“\t” 进行分割。- 为了分析数据时方便对存在多个子元素的数据进行操作,我们首先进行数据重组清洗操作。
- 即:
将所有的类别用“&”分割,同时去掉两边空格,多个相关视频 id 也使用“&”进行分割。
|
|
|
|
|
|
|
|
|
|
---|---|---|---|---|---|---|---|---|---|
|
|
|
|
|
|
|
|
|
|
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.atguigu</groupId>
<artifactId>guli-video</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.2</version>
</dependency>
</dependencies>
</project>
2.1 ETLMapper
/**
* @Date 2020/8/7 13:37
* @Version 10.21
* @Author DuanChaojie
*/
public class ETLMapper extends Mapper<LongWritable, Text, NullWritable,Text> {
Text v = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// 1.获取数据
String oriStr = value.toString();
// 2.过滤数据
String etlStr = ETLUtil.etlStr(oriStr);
// 3.写出
if(etlStr == null){
return;
}
v.set(etlStr);
context.write(NullWritable.get(),v);
}
}
2.2 ETLUtil
public class ETLUtil {
public static String etlStr(String oriStr){
StringBuffer sb = new StringBuffer();
// 1.过滤长度不够的,小于9个字段的
//SDNkMu8ZT68 w00dy911 630 People & Blogs 186 10181 3.49 494 257 rjnbgpPJUksr jnbgpPJUks
int len = 9;
String[] fields = oriStr.split("\t");
if(fields.length < len){
return null;
}
// 2.去掉类别字段中的空格
fields[3] = fields[3].replaceAll(" ","");
// 3.修改相关视频ID字段的分隔符
for (int i = 0; i < fields.length; i++) {
if (i < len){
sb.append(fields[i]).append("\t");
}else{
if (i == fields.length-1){
sb.append(fields[i]);
}else{
sb.append(fields[i]).append("&");
}
}
}
return sb.toString();
}
}
2.3 ETLDriver
public class ETLDriver implements Tool {
Configuration conf = new Configuration();
public int run(String[] args) throws Exception {
// 1.获取Job对象
Job job = Job.getInstance(conf);
// 2.获取jar包路径
job.setJarByClass(ETLDriver.class);
// 3.设置Mapper类和输出KV类型
job.setMapperClass(ETLMapper.class);
// 4.设置最终输出的KV类型
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
// 5.设置输入输出路径
FileInputFormat.setInputPaths(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
// 6.提交任务
boolean result = job.waitForCompletion(true);
return result ? 0:1;
}
public void setConf(Configuration conf) {
this.conf = conf;
}
public Configuration getConf() {
return conf;
}
public static void main(String[] args) {
Configuration conf = new Configuration();
try {
int run = ToolRunner.run(conf, new ETLDriver(), args);
System.out.println("run = " + run);
} catch (Exception e) {
e.printStackTrace();
}
}
}
打成Jar包
guli-video-1.0-SNAPSHOT.jar
hadoop fs -mkdir /gulivideo
hadoop fs -put user/ /gulivideo
hadoop fs -put video/ /gulivideo
# 进行数据清洗
yarn jar /opt/module/data/gulivideo/jars/guli-video-1.0-SNAPSHOT.jar com.atguigu.mr.ETLDriver /gulivideo/video/2008/0222 /gulivideo/output
清洗后的数据
3. 项目准备工作
3.1 创建表
创建表:
gulivideo_ori,gulivideo_user_ori,
存储为
textfile
格式
create table gulivideo_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
create table gulivideo_user_ori(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as textfile;
创建表:
gulivideo_orc,gulivideo_user_orc
,存储为
orc
格式
create table gulivideo_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as orc;
create table gulivideo_user_orc(
uploader string,
videos int,
friends int)
row format delimited fields terminated by "\t"
stored as orc;
3.2 导入ETL后数据
-- 向gulivideo_ori表导入数据
load data inpath "/gulivideo/output/part-r-00000" into table gulivideo_ori;
-- 向gulivideo_user_ori表导入数据
load data inpath "/gulivideo/user/2008/0903" into table gulivideo_user_ori;
向 ORC 表插入数据
insert into table gulivideo_orc select * from gulivideo_ori;
insert into table gulivideo_user_orc select * from gulivideo_user_ori;
4. 业务分析
统计硅谷影音视频网站的常规指标,各种 TopN 指标:
–统计视频观看数 Top10
–统计视频类别热度 Top10
–统计视频观看数 Top20 所属类别以及类别包含的 Top20 的视频个数
–统计视频观看数 Top50 所关联视频的所属类别 Rank
–统计上传视频最多的用户 Top10 以及他们上传的观看次数在前 20 视频
–统计每个类别中的视频热度 Top10
–统计每个类别中视频流量 Top10
–统计每个类别视频观看数 Top10
4.1 统计视频观看数 Top10
select
videoId,
views,
uploader
from
gulivideo_orc
order by
views desc
limit
10;
-- 结果:
videoid views uploader
dMH0bHeiRNg 42513417 judsonlaipply
0XxI-hvPRRA 20282464 smosh
1dmVU08zVpA 16087899 NBC
RB-wUgnyGv0 15712924 ChrisInScotland
QjA5faZF1A8 15256922 guitar90
-_CSo1gOd48 13199833 tasha
49IDp76kjPw 11970018 TexMachina
tYnn51C3X_w 11823701 CowSayingMoo
pv5zWaTEVkI 11672017 OkGo
D2kJZOfq7zk 11184051 mrWoot
4.2 统计视频类别热度 Top10
-- 这里使用limit是为了方便查看结果
-- 使用UDTF函数将类别列炸开
select
videoId,
category_name
from
gulivideo_orc
lateral view explode(category) tmp_tab as category_name
limit 10;t1
-- 结果:
videoid category_name
o4x-VW_rCSE Entertainment
P1OXAQHv09E Comedy
N0TR0Irx4Y0 Comedy
seGhTWE98DU Music
bNF_P281Uu4 Travel
bNF_P281Uu4 Places
CQO3K8BcyGM Comedy
3gg5LOd_Zus Entertainment
sdUUx5FdySs Film
sdUUx5FdySs Animation
-- 按照category_name进行分组,统计每种类别视频的总数,同时按照该总数进行倒序排名,取前10
select
category_name,
count(category_name) category_count
from (
select
videoId,
category_name
from
gulivideo_orc
lateral view explode(category) tmp_tab as category_name
)t1
group by
category_name
order by
category_count desc
limit 10;
-- 结果:
category_name category_count
Music 179049
Entertainment 127674
Comedy 87818
Animation 73293
Film 73293
Sports 67329
Gadgets 59817
Games 59817
Blogs 48890
People 48890
4.3 统计视频观看数 Top20 所属类别以及类别包含的 Top20 的视频个数
-- 统计视频观看数Top20
select
videoId,
views,
category
from
gulivideo_orc
order by
views desc
limit 20;t1
-- 结果:
videoid views category
dMH0bHeiRNg 42513417 ["Comedy"]
0XxI-hvPRRA 20282464 ["Comedy"]
1dmVU08zVpA 16087899 ["Entertainment"]
RB-wUgnyGv0 15712924 ["Entertainment"]
QjA5faZF1A8 15256922 ["Music"]
-_CSo1gOd48 13199833 ["People","Blogs"]
-- ...
-- 对t1表中的category进行炸裂
select
videoId,
category_name
from
(select
videoId,
views,
category
from
gulivideo_orc
order by
views desc
limit 20)t1
lateral view explode(category) tmp_tab as category_name;t2
-- 结果:
videoid category_name
dMH0bHeiRNg Comedy
0XxI-hvPRRA Comedy
1dmVU08zVpA Entertainment
-- ...
-- 对t2表进行分组(category_name)求和(总数)
select
category_name,
count(*) category_count
from
(select
videoId,
category_name
from
(select
videoId,
views,
category
from
gulivideo_orc
order by
views desc
limit 20)t1
lateral view explode(category) tmp_tab as category_name)t2
group by
category_name
order by
category_count desc;
-- 结果:
category_name category_count
Entertainment 6
Comedy 6
Music 5
People 2
Blogs 2
UNA 1
4.4 统计视频观看数 Top50 所关联视频的所属类别 Rank
select
category
from
(select
related_id
from
(select
relatedId,
views
from
gulivideo_orc
order by
views desc
limit 50)t1
lateral view explode(relatedId) tmp_tab as related_id
group by
related_id)t2
join
gulivideo_orc orc
on
t2.related_id = orc.videoId;t3
-- 结果:
["Music"]
["Comedy"]
["Entertainment"]
["People","Blogs"]
["Comedy"]
["Comedy"]
["Comedy"]
["Entertainment"]
["Comedy"]
-- 对t3表中的category进行炸裂
select
explode(category) category_name
from
(select
category
from
(select
related_id
from
(select
relatedId,
views
from
gulivideo_orc
order by
views desc
limit 50)t1
lateral view explode(relatedId) tmp_tab as related_id
group by
related_id)t2
join
gulivideo_orc orc
on
t2.related_id = orc.videoId)t3;t4
-- 结果:
Music
Music
UNA
Entertainment
Music
Music
-- 分组(类别)求和(总数)
select
category_name,
count(category_name) category_count
from
(select
explode(category) category_name
from
(select
category
from
(select
related_id
from
(select
relatedId,
views
from
gulivideo_orc
order by
views desc
limit 50)t1
lateral view explode(relatedId) tmp_tab as related_id
group by
related_id)t2
join
gulivideo_orc orc
on
t2.related_id = orc.videoId)t3)t4
group by
category_name
order by
category_count desc;
-- 结果:
category_name category_count
Comedy 232
Entertainment 216
Music 195
Blogs 51
People 51
Film 47
Animation 47
News 22
Politics 22
Games 20
Gadgets 20
Sports 19
Howto 14
DIY 14
UNA 13
Places 12
Travel 12
Animals 11
Pets 11
Autos 4
Vehicles 4
4.5 统计上传视频最多的用户Top10以及他们上传的观看次数在前20视频
-- 统计上传视频最多的用户Top10
select
uploader,
videos
from
gulivideo_user_orc
order by
videos desc
limit 10;t1
-- 取出这10个人上传的所有视频,按照观看次数进行排名,取前20
select
gulivideo_orc.videoId,
gulivideo_orc.views
from
(select
uploader,
videos
from
gulivideo_user_orc
order by
videos desc
limit 10)t1
join
gulivideo_orc
on
t1.uploader=gulivideo_orc.uploader
order by
views desc
limit 20;
--结果:
video.videoid video.views
-IxHBW0YpZw 39059
BU-fT5XI_8I 29975
ADOcaBYbMl0 26270
yAqsULIDJFE 25511
vcm-t0TJXNg 25366
0KYGFawp14c 24659
j4DpuPvMLF4 22593
Msu4lZb2oeQ 18822
ZHZVj44rpjE 16304
foATQY3wovI 13576
-UnQ8rcBOQs 13450
crtNd46CDks 11639
D1leA0JKHhE 11553
NJu2oG1Wm98 11452
CapbXdyv4j4 10915
epr5erraEp4 10817
IyQoDgaLM7U 10597
tbZibBnusLQ 10402
_GnCHodc7mk 9422
hvEYlSlRitU 7123
4.6 统计每个类别(以Music为例)…
要想统计 Music 类别中的视频热度Top10,需要先找到 Music类别,那么就需要将 category
展开,所以可以创建一张表用于存放 categoryId 展开的数据。向 category 展开的表中插入数据。
create table gulivideo_category( videoId string, uploader string, age int, categoryId string, length int, views int, rate float, ratings int, comments int, relatedId array<string>) row format delimited fields terminated by "\t" collection items terminated by "&" stored as orc; insert into table gulivideo_category select videoId, uploader, age, categoryId, length, views, rate, ratings, comments, relatedId from gulivideo_orc lateral view explode(category) catetory as categoryId; select count(*) from gulivideo_category; -- 结果:1019206 select count(*) from gulivideo_orc; -- 结果:743569
统计对应类别(Music)中的视频热度
-- 给每一种类别根据视频观看数添加rank值(倒序) select categoryId, videoId, views, rank() over(partition by categoryId order by views desc) rk from gulivideo_category; -- 过滤前十 select categoryId, videoId, views from (select categoryId, videoId, views, rank() over(partition by categoryId order by views desc) rk from gulivideo_category)t1 where rk<=10; -- 结果: categoryid videoid views Animals 2GWPOPSXGYI 3660009 Animals xmsV9R8FsDA 3164582 Animals 12PsUW-8ge4 3133523 Animals OeNggIGSKH8 2457750 Animals WofFb_eOxxA 2075728 Animals AgEmZ39EtFk 1999469 Animals a-gW3RbJd8U 1836870 Animals 8CL2hetqpfg 1646808 Animals QmroaYVD_so 1645984 Animals Sg9x5mUjbH8 1527238 -- ... Vehicles RjrEQaG5jPM 2803140 Vehicles cv157ZIInUk 2773979 Vehicles Gyg9U1YaVk8 1832224 Vehicles 6GNB7xT3rNE 1412497 Vehicles tth9krDtxII 1347317 Vehicles 46LQd9dXFRU 1262173 Vehicles pdiuDXwgrjQ 1013697 Vehicles kY_cDpENQLE 956665 Vehicles YtxfbxGz1u4 942604 Vehicles aCamHfJwSGU 847442
☆