Note: Hive version 2.1.1
Test Data
-- create table
create table dept
(
deptno int,
dname varchar(14),
loc varchar(13)
);
insert into dept(deptno, dname, loc)
values ('10', 'accounting', 'new york');
insert into dept(deptno, dname, loc)
values ('20', 'research', 'dallas');
insert into dept(deptno, dname, loc)
values ('30', 'sales', 'chicago');
insert into dept(deptno, dname, loc)
values ('40', 'operations', 'boston');
-- create table
create table emp
(
empno int,
ename varchar(10),
job varchar(9),
mgr int,
hiredate date,
sal decimal(7,2),
comm decimal(7,2),
deptno int
) ;
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7369', 'smith', 'clerk', '7902','1980-12-17', '800', null, '20');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7499', 'allen', 'salesman', '7698', '1981-02-20', '1600', '300', '30');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7521', 'ward', 'salesman', '7698', '1981-02-22', '1250', '500', '30');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7566', 'jones', 'manager', '7839', '1981-04-02', '2975', null, '20');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7654', 'martin', 'salesman', '7698', '1981-09-28', '1250', '1400', '30');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7698', 'blake', 'manager', '7839', '1981-05-01', '2850', null, '30');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7782', 'clark', 'manager', '7839', '1981-06-09', '2450', null, '10');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7788', 'scott', 'analyst', '7566', '1987-06-13', '3000', null, '20');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7839', 'king', 'president', null, '1981-11-17', '5000', null, '10');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7844', 'turner', 'salesman', '7698', '1981-09-08', '1500', '0', '30');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7876', 'adams', 'clerk', '7788', '1987-06-13', '1100', null, '20');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7900', 'james', 'clerk', '7698', '1981-12-03', '950', null, '30');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7902', 'ford', 'analyst', '7566', '1981-12-03', '3000', null, '20');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7934', 'miller', 'clerk', '7782', '1982-01-23', '1300', null, '10');
create table salgrade
(
grade int,
losal int,
hisal int
) ;
insert into salgrade(grade, losal, hisal)
values ('1', '700', '1200');
insert into salgrade(grade, losal, hisal)
values ('2', '1201', '1400');
insert into salgrade(grade, losal, hisal)
values ('3', '1401', '2000');
insert into salgrade(grade, losal, hisal)
values ('4', '2001', '3000');
insert into salgrade(grade, losal, hisal)
values ('5', '3001', '9999');
create table bonus
(
ename varchar(10),
job varchar(9),
sal int,
comm int
);
Function | Purpose |
---|---|
row_number | Sorts the rows and assigns each one a unique, consecutive number. Nondeterministic when there are ties |
rank | Ranks the row values in sorted order; with ties, the following rank value(s) are skipped |
dense_rank | Ranks the row values in sorted order; with ties, no rank values are skipped |
lag | Accesses a previous row within a partition or result set |
lead | Accesses a following row within a partition or result set |
first_value | Accesses the first row of a partition or result set |
last_value | Accesses the last row of a partition or result set |
nth_value | Accesses an arbitrary row of a partition or result set |
percent_rank | Normalizes the computed rank to a fraction between 0 and 1, as (rank - 1) / (n - 1) |
cume_dist | Returns the cumulative distribution: the fraction of rows ordered at or before the current row |
ntile | Distributes the data rows into N roughly equal buckets |
An analytic function has three basic components:
1. Partition clause
2. Order-by clause
3. Windowing clause
function1 (argument1,argument2,..argumentN)
over w
window w as ([partition-by-clause] [order-by-clause] [windowing-clause])
Syntax of the windowing clause:
(ROWS | RANGE) BETWEEN (UNBOUNDED | [num]) PRECEDING AND ([num] PRECEDING | CURRENT ROW | (UNBOUNDED | [num]) FOLLOWING)
(ROWS | RANGE) BETWEEN CURRENT ROW AND (CURRENT ROW | (UNBOUNDED | [num]) FOLLOWING)
(ROWS | RANGE) BETWEEN [num] FOLLOWING AND (UNBOUNDED | [num]) FOLLOWING
If you do not declare a window explicitly, the default window is used; per the Hive documentation, when an order-by clause is present the default is range between unbounded preceding and current row.
Note that not every analytic function supports a windowing clause.
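To make the frame concrete, here is a minimal sketch (against the emp test table above, not part of the original test runs) that uses an explicit windowing clause to compute a running salary total per department:
-- Running total of salaries within each department, in hire-date order
select a.empno,
       a.deptno,
       a.hiredate,
       a.sal,
       sum(a.sal) over w as running_total
from emp a
window w as (partition by a.deptno order by a.hiredate
             rows between unbounded preceding and current row)
;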
1. row_number, rank, dense_rank
row_number syntax:
row_number() over w
window w as (partition-clause order-by-clause)
row_number does not support a windowing clause.
rank and dense_rank share the same syntax as row_number.
Now we want the salary ranking within each department, sorted from highest to lowest:
select a.empno,
a.ename,
a.deptno,
a.sal,
row_number() over w as num,
rank() over w as rank,
dense_rank() over w as dense_rank
from emp a
window w as (partition by a.deptno order by a.sal desc)
;
Test run:
-- Note that deptno 30 contains a duplicate salary, 1250
-- row_number() ignores the tie and simply numbers the rows: 1-2-3-4-5-6
-- rank() gives tied rows the same rank and skips the following value: 1-2-3-4-4-6
-- dense_rank() gives tied rows the same rank without skipping: 1-2-3-4-4-5
hive>
> select a.empno,
> a.ename,
> a.deptno,
> a.sal,
> row_number() over w as num,
> rank() over w as rank,
> dense_rank() over w as dense_rank
> from emp a
> window w as (partition by a.deptno order by a.sal desc)
> ;
Query ID = root_20201217135518_7a47a785-f5ec-41c6-9de8-34faf8e650b9
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0270, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0270/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job -kill job_1606698967173_0270
Hadoop job information for Stage-1: number of mappers: 2; number of reducers: 1
2020-12-17 13:55:29,077 Stage-1 map = 0%, reduce = 0%
2020-12-17 13:55:35,318 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 2.05 sec
2020-12-17 13:55:36,351 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 3.97 sec
2020-12-17 13:55:40,485 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 6.16 sec
MapReduce Total cumulative CPU time: 6 seconds 160 msec
Ended Job = job_1606698967173_0270
MapReduce Jobs Launched:
Stage-Stage-1: Map: 2 Reduce: 1 Cumulative CPU: 6.16 sec HDFS Read: 18321 HDFS Write: 645 HDFS EC Read: 0 SUCCESS
Total MapReduce CPU Time Spent: 6 seconds 160 msec
OK
a.empno a.ename a.deptno a.sal num rank dense_rank
7839 king 10 5000.00 1 1 1
7782 clark 10 2450.00 2 2 2
7934 miller 10 1300.00 3 3 3
7788 scott 20 3000.00 1 1 1
7902 ford 20 3000.00 2 1 1
7566 jones 20 2975.00 3 3 2
7876 adams 20 1100.00 4 4 3
7369 smith 20 800.00 5 5 4
7698 blake 30 2850.00 1 1 1
7499 allen 30 1600.00 2 2 2
7844 turner 30 1500.00 3 3 3
7654 martin 30 1250.00 4 4 4
7521 ward 30 1250.00 5 4 4
7900 james 30 950.00 6 6 5
Time taken: 22.644 seconds, Fetched: 14 row(s)
hive>
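A common application of row_number is top-N-per-group filtering. Below is a minimal sketch (not part of the original test run) that keeps the two highest-paid employees of each department; swap in dense_rank if tied salaries should all survive the filter:
-- Top 2 salaries per department
select t.empno, t.ename, t.deptno, t.sal
from (select a.empno, a.ename, a.deptno, a.sal,
             row_number() over (partition by a.deptno order by a.sal desc) as rn
      from emp a) t
where t.rn <= 2
;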
2. lag, lead
lag syntax:
lag (expression, offset, default) over w
window w as (partition-clause order-by-clause)
lag does not support a windowing clause.
lead shares the same syntax as lag.
Code:
-- Within each group, fetch the value n rows back and n rows ahead; for the first/last rows, fall back to a default value
SELECT a.empno,
a.deptno,
a.hiredate,
a.sal,
lag(sal, 1, 0) over w as pre_sal,
lead(sal, 1, 0) over w as next_sal,
lag(sal, 2, 0) over w as pre2_sal,
lead(sal, 2, 0) over w as next_2sal
FROM emp a
window w as (PARTITION BY a.deptno ORDER BY hiredate ASC)
;
-- When there is no alphabetically smaller name, default to 'AAA'; when there is no larger name, default to 'ZZZ'
select deptno,
ename,
lag(ename, 1, 'AAA') over w as lower_name,
lead(ename, 1, 'ZZZ') over w as higher_name
from emp
window w as(PARTITION BY deptno ORDER BY ename)
;
-- Print the department number only on the first row of each department; repeated values become NULL
select (case when deptno= lag(deptno,1)over w then null else deptno end) as deptno,
ename,
lag(ename, 1, 'AAA') over w as lower_name,
lead(ename, 1, 'ZZZ') over w as higher_name
from emp
window w as (PARTITION BY deptno ORDER BY ename)
;
Test run:
hive> -- Within each group, fetch the value n rows back and n rows ahead; for the first/last rows, fall back to a default value
hive> SELECT a.empno,
> a.deptno,
> a.hiredate,
> a.sal,
> lag(sal, 1, 0) over w as pre_sal,
> lead(sal, 1, 0) over w as next_sal,
> lag(sal, 2, 0) over w as pre2_sal,
> lead(sal, 2, 0) over w as next_2sal
> FROM emp a
> window w as (PARTITION BY a.deptno ORDER BY hiredate ASC)
> ;
Query ID = root_20201217140032_63a2024b-7a8f-452a-8cf7-c27653105565
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0271, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0271/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job -kill job_1606698967173_0271
Hadoop job information for Stage-1: number of mappers: 2; number of reducers: 1
2020-12-17 14:00:39,812 Stage-1 map = 0%, reduce = 0%
2020-12-17 14:00:46,008 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 3.99 sec
2020-12-17 14:00:52,200 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 6.39 sec
MapReduce Total cumulative CPU time: 6 seconds 390 msec
Ended Job = job_1606698967173_0271
MapReduce Jobs Launched:
Stage-Stage-1: Map: 2 Reduce: 1 Cumulative CPU: 6.39 sec HDFS Read: 19017 HDFS Write: 1021 HDFS EC Read: 0 SUCCESS
Total MapReduce CPU Time Spent: 6 seconds 390 msec
OK
a.empno a.deptno a.hiredate a.sal pre_sal next_sal pre2_sal next_2sal
7782 10 1981-06-09 2450.00 0.00 5000.00 0.00 1300.00
7839 10 1981-11-17 5000.00 2450.00 1300.00 0.00 0.00
7934 10 1982-01-23 1300.00 5000.00 0.00 2450.00 0.00
7369 20 1980-12-17 800.00 0.00 2975.00 0.00 3000.00
7566 20 1981-04-02 2975.00 800.00 3000.00 0.00 3000.00
7902 20 1981-12-03 3000.00 2975.00 3000.00 800.00 1100.00
7788 20 1987-06-13 3000.00 3000.00 1100.00 2975.00 0.00
7876 20 1987-06-13 1100.00 3000.00 0.00 3000.00 0.00
7499 30 1981-02-20 1600.00 0.00 1250.00 0.00 2850.00
7521 30 1981-02-22 1250.00 1600.00 2850.00 0.00 1500.00
7698 30 1981-05-01 2850.00 1250.00 1500.00 1600.00 1250.00
7844 30 1981-09-08 1500.00 2850.00 1250.00 1250.00 950.00
7654 30 1981-09-28 1250.00 1500.00 950.00 2850.00 0.00
7900 30 1981-12-03 950.00 1250.00 0.00 1500.00 0.00
Time taken: 20.831 seconds, Fetched: 14 row(s)
hive>
> -- When there is no alphabetically smaller name, default to 'AAA'; when there is no larger name, default to 'ZZZ'
> select deptno,
> ename,
> lag(ename, 1, 'AAA') over w as lower_name,
> lead(ename, 1, 'ZZZ') over w as higher_name
> from emp
> window w as(PARTITION BY deptno ORDER BY ename)
> ;
Query ID = root_20201217140106_f847f43a-b525-40dc-a276-9f79d2c538f4
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0272, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0272/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job -kill job_1606698967173_0272
Hadoop job information for Stage-1: number of mappers: 2; number of reducers: 1
2020-12-17 14:01:13,271 Stage-1 map = 0%, reduce = 0%
2020-12-17 14:01:19,458 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 3.96 sec
2020-12-17 14:01:24,610 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 6.37 sec
MapReduce Total cumulative CPU time: 6 seconds 370 msec
Ended Job = job_1606698967173_0272
MapReduce Jobs Launched:
Stage-Stage-1: Map: 2 Reduce: 1 Cumulative CPU: 6.37 sec HDFS Read: 17623 HDFS Write: 537 HDFS EC Read: 0 SUCCESS
Total MapReduce CPU Time Spent: 6 seconds 370 msec
OK
deptno ename lower_name higher_name
10 clark AAA king
10 king clark miller
10 miller king ZZZ
20 adams AAA ford
20 ford adams jones
20 jones ford scott
20 scott jones smith
20 smith scott ZZZ
30 allen AAA blake
30 blake allen james
30 james blake martin
30 martin james turner
30 turner martin ward
30 ward turner ZZZ
Time taken: 20.52 seconds, Fetched: 14 row(s)
hive> -- Print the department number only on the first row of each department; repeated values become NULL
hive> select (case when deptno= lag(deptno,1)over w then null else deptno end) as deptno,
> ename,
> lag(ename, 1, 'AAA') over w as lower_name,
> lead(ename, 1, 'ZZZ') over w as higher_name
> from emp
> window w as (PARTITION BY deptno ORDER BY ename)
> ;
Query ID = root_20201217140128_f6137d9e-0feb-434e-9a28-766833bec6d2
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0273, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0273/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job -kill job_1606698967173_0273
Hadoop job information for Stage-1: number of mappers: 2; number of reducers: 1
2020-12-17 14:01:37,413 Stage-1 map = 0%, reduce = 0%
2020-12-17 14:01:43,610 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 4.06 sec
2020-12-17 14:01:49,800 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 7.55 sec
MapReduce Total cumulative CPU time: 7 seconds 550 msec
Ended Job = job_1606698967173_0273
MapReduce Jobs Launched:
Stage-Stage-1: Map: 2 Reduce: 1 Cumulative CPU: 7.55 sec HDFS Read: 18183 HDFS Write: 537 HDFS EC Read: 0 SUCCESS
Total MapReduce CPU Time Spent: 7 seconds 550 msec
OK
deptno ename lower_name higher_name
10 clark AAA king
NULL king clark miller
NULL miller king ZZZ
20 adams AAA ford
NULL ford adams jones
NULL jones ford scott
NULL scott jones smith
NULL smith scott ZZZ
30 allen AAA blake
NULL blake allen james
NULL james blake martin
NULL martin james turner
NULL turner martin ward
NULL ward turner ZZZ
Time taken: 21.956 seconds, Fetched: 14 row(s)
hive>
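lag is also the natural tool for row-to-row deltas. A minimal sketch (not part of the original test run) computing each employee's salary difference versus the previously hired employee in the same department; the first hire has no predecessor, so its delta is NULL:
-- Salary delta versus the previous hire in the same department
select a.empno,
       a.deptno,
       a.hiredate,
       a.sal,
       a.sal - lag(a.sal, 1) over w as sal_delta
from emp a
window w as (partition by a.deptno order by a.hiredate)
;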
3. first_value, last_value
first_value and last_value syntax:
first_value(expression) over w
window w as (partition-clause order-by-clause windowing-clause)
last_value(expression) over w
window w as (partition-clause order-by-clause windowing-clause)
Code:
/*
 Requirement: find the highest and the lowest salary in each department
*/
-- Default, with no windowing clause: from the first row of the partition to the current row
select a.empno,a.deptno,a.sal,
first_value(a.sal) over w as first,
last_value(a.sal) over w as last
from emp a
window w as (partition by a.deptno order by sal)
;
-- rows between unbounded preceding and current row: from the first row to the current row
select a.empno,a.deptno,a.sal,
first_value(a.sal) over w as first,
last_value(a.sal) over w as last
from emp a
window w as (partition by a.deptno order by sal rows between unbounded preceding and current row)
;
-- rows between unbounded preceding and unbounded following: from the first row to the last row
select a.empno,a.deptno,a.sal,
first_value(a.sal) over w as first,
last_value(a.sal) over w as last
from emp a
window w as (partition by a.deptno order by sal rows between unbounded preceding and unbounded following)
;
-- 1 preceding and 1 following: from the row before the current row to the row after it
select a.empno,a.deptno,a.sal,
first_value(a.sal) over w as first,
last_value(a.sal) over w as last
from emp a
window w as (partition by a.deptno order by sal rows between 1 preceding and 1 following)
;
Test run:
hive>
> -- Default, with no windowing clause: from the first row of the partition to the current row
> select a.empno,a.deptno,a.sal,
> first_value(a.sal) over w as first,
> last_value(a.sal) over w as last
> from emp a
> window w as (partition by a.deptno order by sal)
> ;
Query ID = root_20201217140513_e49fe743-41eb-4291-a7a3-afef68e208f3
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0274, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0274/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job -kill job_1606698967173_0274
Hadoop job information for Stage-1: number of mappers: 2; number of reducers: 1
2020-12-17 14:05:20,777 Stage-1 map = 0%, reduce = 0%
2020-12-17 14:05:26,956 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 4.07 sec
2020-12-17 14:05:33,130 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 6.59 sec
MapReduce Total cumulative CPU time: 6 seconds 590 msec
Ended Job = job_1606698967173_0274
MapReduce Jobs Launched:
Stage-Stage-1: Map: 2 Reduce: 1 Cumulative CPU: 6.59 sec HDFS Read: 18024 HDFS Write: 688 HDFS EC Read: 0 SUCCESS
Total MapReduce CPU Time Spent: 6 seconds 590 msec
OK
a.empno a.deptno a.sal first last
7934 10 1300.00 1300.00 1300.00
7782 10 2450.00 1300.00 2450.00
7839 10 5000.00 1300.00 5000.00
7369 20 800.00 800.00 800.00
7876 20 1100.00 800.00 1100.00
7566 20 2975.00 800.00 2975.00
7788 20 3000.00 800.00 3000.00
7902 20 3000.00 800.00 3000.00
7900 30 950.00 950.00 950.00
7654 30 1250.00 950.00 1250.00
7521 30 1250.00 950.00 1250.00
7844 30 1500.00 950.00 1500.00
7499 30 1600.00 950.00 1600.00
7698 30 2850.00 950.00 2850.00
Time taken: 20.46 seconds, Fetched: 14 row(s)
hive> -- rows between unbounded preceding and current row: from the first row to the current row
hive> select a.empno,a.deptno,a.sal,
> first_value(a.sal) over w as first,
> last_value(a.sal) over w as last
> from emp a
> window w as (partition by a.deptno order by sal rows between unbounded preceding and current row)
> ;
Query ID = root_20201217140542_13d4a59c-1986-4c22-a894-83b31f8b49ab
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0275, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0275/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job -kill job_1606698967173_0275
Hadoop job information for Stage-1: number of mappers: 2; number of reducers: 1
2020-12-17 14:05:49,767 Stage-1 map = 0%, reduce = 0%
2020-12-17 14:05:55,936 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 4.01 sec
2020-12-17 14:06:01,081 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 6.19 sec
MapReduce Total cumulative CPU time: 6 seconds 190 msec
Ended Job = job_1606698967173_0275
MapReduce Jobs Launched:
Stage-Stage-1: Map: 2 Reduce: 1 Cumulative CPU: 6.19 sec HDFS Read: 17930 HDFS Write: 688 HDFS EC Read: 0 SUCCESS
Total MapReduce CPU Time Spent: 6 seconds 190 msec
OK
a.empno a.deptno a.sal first last
7934 10 1300.00 1300.00 1300.00
7782 10 2450.00 1300.00 2450.00
7839 10 5000.00 1300.00 5000.00
7369 20 800.00 800.00 800.00
7876 20 1100.00 800.00 1100.00
7566 20 2975.00 800.00 2975.00
7788 20 3000.00 800.00 3000.00
7902 20 3000.00 800.00 3000.00
7900 30 950.00 950.00 950.00
7654 30 1250.00 950.00 1250.00
7521 30 1250.00 950.00 1250.00
7844 30 1500.00 950.00 1500.00
7499 30 1600.00 950.00 1600.00
7698 30 2850.00 950.00 2850.00
Time taken: 20.402 seconds, Fetched: 14 row(s)
hive> -- rows between unbounded preceding and unbounded following: from the first row to the last row
hive> select a.empno,a.deptno,a.sal,
> first_value(a.sal) over w as first,
> last_value(a.sal) over w as last
> from emp a
> window w as (partition by a.deptno order by sal rows between unbounded preceding and unbounded following)
> ;
Query ID = root_20201217140615_8d331653-75fd-429f-8989-55fcd0e86cfb
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0276, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0276/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job -kill job_1606698967173_0276
Hadoop job information for Stage-1: number of mappers: 2; number of reducers: 1
2020-12-17 14:06:22,884 Stage-1 map = 0%, reduce = 0%
2020-12-17 14:06:30,087 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 3.99 sec
2020-12-17 14:06:35,236 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 6.39 sec
MapReduce Total cumulative CPU time: 6 seconds 390 msec
Ended Job = job_1606698967173_0276
MapReduce Jobs Launched:
Stage-Stage-1: Map: 2 Reduce: 1 Cumulative CPU: 6.39 sec HDFS Read: 17896 HDFS Write: 690 HDFS EC Read: 0 SUCCESS
Total MapReduce CPU Time Spent: 6 seconds 390 msec
OK
a.empno a.deptno a.sal first last
7934 10 1300.00 1300.00 5000.00
7782 10 2450.00 1300.00 5000.00
7839 10 5000.00 1300.00 5000.00
7369 20 800.00 800.00 3000.00
7876 20 1100.00 800.00 3000.00
7566 20 2975.00 800.00 3000.00
7788 20 3000.00 800.00 3000.00
7902 20 3000.00 800.00 3000.00
7900 30 950.00 950.00 2850.00
7654 30 1250.00 950.00 2850.00
7521 30 1250.00 950.00 2850.00
7844 30 1500.00 950.00 2850.00
7499 30 1600.00 950.00 2850.00
7698 30 2850.00 950.00 2850.00
Time taken: 20.875 seconds, Fetched: 14 row(s)
hive> -- 1 preceding and 1 following: from the row before the current row to the row after it
hive> select a.empno,a.deptno,a.sal,
> first_value(a.sal) over w as first,
> last_value(a.sal) over w as last
> from emp a
> window w as (partition by a.deptno order by sal rows between 1 preceding and 1 following)
> ;
Query ID = root_20201217140640_8e3c39f1-104b-42ad-a8fc-8d0294cb895c
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0277, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0277/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job -kill job_1606698967173_0277
Hadoop job information for Stage-1: number of mappers: 2; number of reducers: 1
2020-12-17 14:06:47,701 Stage-1 map = 0%, reduce = 0%
2020-12-17 14:06:52,960 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 1.99 sec
2020-12-17 14:06:53,990 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 4.12 sec
2020-12-17 14:06:59,134 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 6.23 sec
MapReduce Total cumulative CPU time: 6 seconds 230 msec
Ended Job = job_1606698967173_0277
MapReduce Jobs Launched:
Stage-Stage-1: Map: 2 Reduce: 1 Cumulative CPU: 6.23 sec HDFS Read: 17864 HDFS Write: 697 HDFS EC Read: 0 SUCCESS
Total MapReduce CPU Time Spent: 6 seconds 230 msec
OK
a.empno a.deptno a.sal first last
7934 10 1300.00 1300.00 2450.00
7782 10 2450.00 1300.00 5000.00
7839 10 5000.00 2450.00 5000.00
7369 20 800.00 800.00 1100.00
7876 20 1100.00 800.00 2975.00
7566 20 2975.00 1100.00 3000.00
7788 20 3000.00 2975.00 3000.00
7902 20 3000.00 3000.00 3000.00
7900 30 950.00 950.00 1250.00
7654 30 1250.00 950.00 1250.00
7521 30 1250.00 1250.00 1500.00
7844 30 1500.00 1250.00 1600.00
7499 30 1600.00 1500.00 2850.00
7698 30 2850.00 1600.00 2850.00
Time taken: 19.959 seconds, Fetched: 14 row(s)
hive>
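For the stated requirement (highest and lowest salary per department), only the unbounded preceding/unbounded following variant returns the answer on every row. An equivalent and often simpler sketch (not part of the original test run) uses min/max as window functions, which need no ordering or framing:
-- Highest and lowest salary per department, on every row
select a.empno,
       a.deptno,
       a.sal,
       min(a.sal) over (partition by a.deptno) as lowest,
       max(a.sal) over (partition by a.deptno) as highest
from emp a
;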
4. percent_rank, cume_dist
percent_rank syntax:
percent_rank() over w
window w as ([partition-by-clause] [order-by-clause] )
cume_dist syntax:
cume_dist() over w
window w as ([partition-by-clause] [order-by-clause] )
percent_rank:
-- Returns the rank of a value within its data partition as a fraction between 0 and 1
-- Computed as (rank - 1) / (n - 1); e.g. for martin in the 6-row deptno 30 partition below, (4 - 1) / (6 - 1) = 0.6
cume_dist:
-- In a 5-row group without ties, the cumulative distribution values are 0.2, 0.4, 0.6, 0.8, 1.0
-- Note that for tied rows, the position of the last row in the tie group is used
Code:
SELECT a.empno,
a.ename,
a.deptno,
a.sal,
percent_rank() over w as num,
cume_dist() over w as cume
FROM emp a
window w as (PARTITION BY a.deptno ORDER BY a.sal DESC)
;
Test run:
hive>
>
> SELECT a.empno,
> a.ename,
> a.deptno,
> a.sal,
> percent_rank() over w as num,
> cume_dist() over w as cume
> FROM emp a
> window w as (PARTITION BY a.deptno ORDER BY a.sal DESC)
> ;
Query ID = root_20201217140833_0c946fa7-a362-4141-a863-3f8a4674afba
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0278, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0278/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job -kill job_1606698967173_0278
Hadoop job information for Stage-1: number of mappers: 2; number of reducers: 1
2020-12-17 14:08:40,917 Stage-1 map = 0%, reduce = 0%
2020-12-17 14:08:48,122 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 3.91 sec
2020-12-17 14:08:54,301 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 6.37 sec
MapReduce Total cumulative CPU time: 6 seconds 370 msec
Ended Job = job_1606698967173_0278
MapReduce Jobs Launched:
Stage-Stage-1: Map: 2 Reduce: 1 Cumulative CPU: 6.37 sec HDFS Read: 18317 HDFS Write: 765 HDFS EC Read: 0 SUCCESS
Total MapReduce CPU Time Spent: 6 seconds 370 msec
OK
a.empno a.ename a.deptno a.sal num cume
7839 king 10 5000.00 0.0 0.3333333333333333
7782 clark 10 2450.00 0.5 0.6666666666666666
7934 miller 10 1300.00 1.0 1.0
7788 scott 20 3000.00 0.0 0.4
7902 ford 20 3000.00 0.0 0.4
7566 jones 20 2975.00 0.5 0.6
7876 adams 20 1100.00 0.75 0.8
7369 smith 20 800.00 1.0 1.0
7698 blake 30 2850.00 0.0 0.16666666666666666
7499 allen 30 1600.00 0.2 0.3333333333333333
7844 turner 30 1500.00 0.4 0.5
7654 martin 30 1250.00 0.6 0.8333333333333334
7521 ward 30 1250.00 0.6 0.8333333333333334
7900 james 30 950.00 1.0 1.0
Time taken: 21.471 seconds, Fetched: 14 row(s)
hive>
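cume_dist lends itself to percentile filters. A minimal sketch (not part of the original test run) that keeps the employees at or below their department's salary median:
-- Employees in the bottom half of their department's salary distribution
select t.empno, t.ename, t.deptno, t.sal
from (select a.empno, a.ename, a.deptno, a.sal,
             cume_dist() over (partition by a.deptno order by a.sal) as cd
      from emp a) t
where t.cd <= 0.5
;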
5. ntile
ntile syntax:
Ntile(expr) OVER w
window w as ([ query_partition_clause ] order_by_clause)
ntile distributes the data rows into N buckets. Each bucket gets the same number of rows, give or take one.
Split the emp table into 2 and then into 3 buckets by salary.
Code:
-- Split into 2 buckets
SELECT ENAME, SAL, NTILE(2) OVER w as n FROM EMP
window w as (ORDER BY SAL ASC)
;
-- Split into 3 buckets
SELECT ENAME, SAL, NTILE(3) OVER w as n FROM EMP
window w as (ORDER BY SAL ASC)
;
Test run:
hive>
> -- Split into 2 buckets
> SELECT ENAME, SAL, NTILE(2) OVER w as n FROM EMP
> window w as (ORDER BY SAL ASC)
> ;
Query ID = root_20201217141022_8d69be19-e30a-4d30-9e20-63e13d915521
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0279, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0279/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job -kill job_1606698967173_0279
Hadoop job information for Stage-1: number of mappers: 2; number of reducers: 1
2020-12-17 14:10:28,769 Stage-1 map = 0%, reduce = 0%
2020-12-17 14:10:34,943 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 2.04 sec
2020-12-17 14:10:35,973 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 3.97 sec
2020-12-17 14:10:41,117 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 6.4 sec
MapReduce Total cumulative CPU time: 6 seconds 400 msec
Ended Job = job_1606698967173_0279
MapReduce Jobs Launched:
Stage-Stage-1: Map: 2 Reduce: 1 Cumulative CPU: 6.4 sec HDFS Read: 17143 HDFS Write: 477 HDFS EC Read: 0 SUCCESS
Total MapReduce CPU Time Spent: 6 seconds 400 msec
OK
ename sal n
smith 800.00 1
james 950.00 1
adams 1100.00 1
ward 1250.00 1
martin 1250.00 1
miller 1300.00 1
turner 1500.00 1
allen 1600.00 2
clark 2450.00 2
blake 2850.00 2
jones 2975.00 2
scott 3000.00 2
ford 3000.00 2
king 5000.00 2
Time taken: 19.869 seconds, Fetched: 14 row(s)
hive> -- Split into 3 buckets
hive> SELECT ENAME, SAL, NTILE(3) OVER w as n FROM EMP
> window w as (ORDER BY SAL ASC)
> ;
Query ID = root_20201217141102_5483203c-32bc-45cc-b7ab-65eeaa061696
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0280, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0280/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job -kill job_1606698967173_0280
Hadoop job information for Stage-1: number of mappers: 2; number of reducers: 1
2020-12-17 14:11:08,889 Stage-1 map = 0%, reduce = 0%
2020-12-17 14:11:15,086 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 2.04 sec
2020-12-17 14:11:16,115 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 4.01 sec
2020-12-17 14:11:21,255 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 6.22 sec
MapReduce Total cumulative CPU time: 6 seconds 220 msec
Ended Job = job_1606698967173_0280
MapReduce Jobs Launched:
Stage-Stage-1: Map: 2 Reduce: 1 Cumulative CPU: 6.22 sec HDFS Read: 17143 HDFS Write: 477 HDFS EC Read: 0 SUCCESS
Total MapReduce CPU Time Spent: 6 seconds 220 msec
OK
ename sal n
smith 800.00 1
james 950.00 1
adams 1100.00 1
ward 1250.00 1
martin 1250.00 1
miller 1300.00 2
turner 1500.00 2
allen 1600.00 2
clark 2450.00 2
blake 2850.00 2
jones 2975.00 3
scott 3000.00 3
ford 3000.00 3
king 5000.00 3
Time taken: 19.511 seconds, Fetched: 14 row(s)
hive>
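ntile also makes quick work of quartile-style reports. A minimal sketch (not part of the original test run) that labels every employee with a salary quartile and then summarizes each quartile's boundaries:
-- Salary quartile boundaries across the whole company
select t.quartile,
       min(t.sal) as lo_sal,
       max(t.sal) as hi_sal,
       count(*) as cnt
from (select sal, ntile(4) over (order by sal) as quartile
      from emp) t
group by t.quartile
;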
References
1.https://cwiki.apache.org/confluence/display/Hive/LanguageManual+WindowingAndAnalytics