HIVE查询以及窗口函数示例

–order by：指定列排序

select 
name,      
dept_num,   
employee_id,
salary    
from employee_contract
order by salary desc;

在这里插入图片描述

–按指定列的位排序

set hive.groupby.orderby.position.alias=true;

select 
name,      
dept_num,   
employee_id,
salary    
from employee_contract
order by 3 desc;

在这里插入图片描述

–查询reduces个数

set mapreduce.job.reduces;

–sort by：没设置mapreduce数量前


select 
name,      
dept_num,   
employee_id,
salary    
from employee_contract
sort by salary desc;

在这里插入图片描述

–sort by：设置mapreduce数量

set mapreduce.job.reduces=2;

select 
name,      
dept_num,   
employee_id,
salary    
from employee_contract
sort by salary desc;

在这里插入图片描述

–distribute by + sort by

select 
name,      
dept_num,   
employee_id,
salary    
from employee_contract
distribute by dept_num sort by salary;

在这里插入图片描述

–强行设置mapreduce=1后，就还是全局排序

set mapreduce.job.reduces=1;
select 
name,      
dept_num,   
employee_id,
salary    
from employee_contract
distribute by dept_num sort by salary;

在这里插入图片描述

–cluster by=distribute by + sort by

set mapreduce.job.reduces=2;

select 
name,      
dept_num,   
employee_id,
salary    
from employee_contract
cluster by salary;

在这里插入图片描述

–group by

–每个部门所有员工的薪资总和

select
dept_num,sum(salary)
from employee_contract
group by dept_num;

在这里插入图片描述

select
dept_num,name,sum(salary)
from employee_contract
group by dept_num;

—如果select聚合字段前的字段比group by的字段多的话，会报错：Error: Error while compiling statement: FAILED: SemanticException [Error 10025]: Line 2:9 Expression not in GROUP BY key ‘name’ (state=42000,code=10025)

select
sum(salary)
from employee_contract
group by dept_num;

–select聚合字段前的字段可以没有，group by的字段随意写

在这里插入图片描述

–给薪资分组

select
if(salary>5000,'good','bad') as s,sum(salary)
from employee_contract
group by if(salary>5000,'good','bad');

在这里插入图片描述

–row_number()

–给所有员工薪资排序

–从低到高，针对所有员工

select name,dept_num,salary,
row_number() over(order by salary) as rn
from employee_contract;

在这里插入图片描述

–rank()

select name,dept_num,salary,
rank() over(order by salary) as rn
from employee_contract;

在这里插入图片描述

–dense_rank()

select name,dept_num,salary,
dense_rank() over(order by salary) as rn
from employee_contract;

在这里插入图片描述

–按部门，对每个部门的员工薪资排序

select name,dept_num,salary,
row_number() over(partition by dept_num order by salary) as rn
from employee_contract;

在这里插入图片描述

–按部门分组，获取每个部门薪资最低的员工（考虑重复）

–分组求top n

select *
from(
select name,dept_num,salary,
dense_rank() over(partition by dept_num order by salary) as rn
from employee_contract
)t
where t.rn in (1,2,3);

在这里插入图片描述

–ntile

select name,dept_num,salary,
ntile(2) over(partition by dept_num order by salary) as nt
from employee_contract;

在这里插入图片描述

–percent_rank

select name,dept_num,salary,
percent_rank() over(order by salary) as pr
from employee_contract;

在这里插入图片描述

–组合窗口函数使用

–窗口函数序列

SELECT 
name, dept_num, salary, 
ROW_NUMBER() OVER () AS row_num, 
RANK() OVER (PARTITION BY dept_num ORDER BY salary) AS rank, 
DENSE_RANK() OVER (PARTITION BY dept_num ORDER BY salary) AS dense_rank,
PERCENT_RANK() OVER(PARTITION BY dept_num ORDER BY salary) AS percent_rank, 
NTILE(2) OVER(PARTITION BY dept_num ORDER BY salary) AS ntile 
FROM employee_contract 
ORDER BY dept_num, salary;

在这里插入图片描述

–count

select name,dept_num,salary,
count(*) over(partition by dept_num) as rc
from employee_contract;

在这里插入图片描述

–sum

select name,dept_num,salary,
sum(salary) over(partition by dept_num order by salary) as rc
from employee_contract;

–分区内累计到当前行累加，salary相同时当成一个累加

在这里插入图片描述

select name,dept_num,salary,
sum(salary) over(order by salary) as rc
from employee_contract;

在这里插入图片描述

select name,dept_num,salary,
sum(salary) over(order by dept_num,name rows unbounded preceding) as rc
from employee_contract;

–累计到当前行累加

在这里插入图片描述

select name,dept_num,salary,
sum(salary) over(order by dept_num,name) as rc
from employee_contract;

–报错order by字段只能一个

Error: Error while compiling statement: FAILED: SemanticException Range based Window Frame can have only 1 Sort Key (state=42000,code=40000)

select name,dept_num,salary,
sum(salary) over(order by name) as rc
from employee_contract;

在这里插入图片描述

–求平均值，最大，最小

select name,dept_num,salary,
avg(salary) over(partition by dept_num) as avgDept,
min(salary) over(partition by dept_num) as minDept,
max(salary) over(partition by dept_num) as maxDept
from employee_contract;

在这里插入图片描述

–lead

select name,dept_num,salary,
lead(salary,1) over(partition by dept_num order by salary) as lead 
from employee_contract;

–分区内当前行的后1行,不存在则写null

在这里插入图片描述

select name,dept_num,salary,
lead(salary,1,0) over(partition by dept_num order by salary) as lead 
from employee_contract;

–分区内当前行的后1行,不存在则写0

在这里插入图片描述

–lag

select name,dept_num,salary,
lag(salary,1) over(partition by dept_num order by salary) as lag 
from employee_contract;

–分区内当前行的前1行,不存在则null

在这里插入图片描述

select name,dept_num,salary,
lag(salary,1,0) over(partition by dept_num order by salary) as lag 
from employee_contract;

–分区内当前行的前1行,不存在则写0

在这里插入图片描述

–first_value last_value

select name,dept_num,salary,
first_value() over(partition by dept_num order by salary) as fv,last_value() over(partition by dept_num order by salary) as lv
from employee_contract;

–报错first_value()中要写参数

–Error: Error while compiling statement: FAILED: NullPointerException null (state=42000,code=40000)

select name,dept_num,salary,
first_value(salary) over(partition by dept_num order by salary) as fv,last_value(salary) over(partition by dept_num order by salary) as lv
from employee_contract;

–fv：分区内截至到当前行的第一行

–lv：分区内截至到当前行的最后一行，也就是分区内的当前行

在这里插入图片描述

–cume_dist

select name,dept_num,salary,
cume_dist(salary) over(partition by dept_num order by salary) as cd
from employee_contract;

–报错cume_dist()中不能有参数

–Error: Error while compiling statement: FAILED: SemanticException Failed to breakup Windowing invocations into Groups. At least 1 group must only depend on input columns. Also check for circular dependencies.

Underlying error: Ranking Functions can take no arguments (state=42000,code=40000)

select name,dept_num,salary,
cume_dist() over(partition by dept_num order by salary) as cd
from employee_contract;

在这里插入图片描述

–range between

 select name,dept_num As dept, salary As sal,salary-1000 as s1,salary+1000 as s2 ,max (salary)OVER (PARTITION BY dept_num ORDER BY salary RANGE BETWEEN 1000 PRECEDING AND 1000 FOLLOWING) win1 from employee_contract

–win1:分区内的值在（当前行-100~当前行+1000）范围内的最大值

在这里插入图片描述

原文链接：https://blog.csdn.net/dontlikerabbit/article/details/111035515

你可能也喜欢