Hive
Hive is a data warehouse tool built on top of Hadoop for data extraction, transformation, and loading (ETL). It provides a mechanism to store, query, and analyze large-scale data stored in Hadoop.
Hive exposes a set of interactive interfaces through which it receives user commands (SQL). Its Driver, together with the metadata in the MetaStore, translates those commands into MapReduce jobs, submits them to Hadoop for execution, and finally returns the results to the user interface.
1. Examples
1.1 Using AS
Column aliases: Hive differs from MySQL here.
MySQL: single quotes ('') around the alias
Hive: backticks (``), as shown in the query below
SELECT
week_name AS `经营周`,
start_time_string AS `开始时间`,
end_time_string AS `结束时间`,
device_code AS `设备名称`,
project_name AS `楼盘名称`,
city_name AS `城市名称`,
area_name AS `城市区域`,
address AS `地址`,
surplus_rate AS `余量上刊率`,
cash_rate AS `现金上刊率`,
substitution_rate AS `置换上刊率`,
give_rate AS `赠播上刊率`,
comprehensive_rate AS `综合上刊率`
FROM
(
SELECT
*,
row_number() over (ORDER BY week_id, device_code) rn
FROM
dm2.dm2_pv_result_point_rate
WHERE
week_id >= 2019011
AND week_id <= 2020122 AND cus_type = 0 AND device_type = 1 AND cash_rate >= 0
AND cash_rate < 9999.0
) bb
2. Hive SQL
2.1 Basic operations
-- 1. Drop tables
drop table if exists passwords;
drop table if exists grpshell;
-- 2. Create a table
create table if not exists passwords (
    username string, passwd string, uid int, gid int,
    userinfo string, home string, shell string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ':' LINES TERMINATED BY '10'; -- '10' is the ASCII code of the newline character
-- 3. Load data (associate the file with the table)
load data inpath '${hiveconf:inputFile}' into table passwords;
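-- the column list and ':' delimiter match the /etc/passwd layout, so an input line looks like:
-- root:x:0:0:root:/root:/bin/bash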
-- 4. Copy data from an existing table into a new table
create table if not exists grpshell (shell string, count int);
INSERT OVERWRITE TABLE grpshell SELECT p.shell, count(*) FROM passwords p GROUP BY p.shell;
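-- a usage sketch to inspect the aggregated result (backticks avoid ambiguity with the count() function):
SELECT shell, `count` FROM grpshell ORDER BY `count` DESC;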
-- 5. Create a partitioned table (depends on the concrete business scenario)
-- partitioned by creates parent and child directories; distinguish dynamic vs. static partitions, and managed (internal) vs. external tables
create table if not exists table_name (
c1 string comment 'c1',
c2 string comment 'c2',
c3 string comment 'c3'
) comment 'table_name' partitioned by (month_id string,day_id string)
-- a field delimiter can also be defined: row format delimited fields terminated by '|'
stored as textfile; -- or parquet / orc
-- 6. Create an external table (external keyword)
-- partitioned by creates parent and child directories; distinguish dynamic vs. static partitions, and managed (internal) vs. external tables
create external table if not exists table_name (
c1 string comment 'c1',
c2 string comment 'c2',
c3 string comment 'c3'
) comment 'table_name' partitioned by (month_id string,day_id string)
-- a field delimiter can also be defined: row format delimited fields terminated by '|'
stored as textfile; -- or parquet / orc
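-- an external table usually points at an existing HDFS directory via location; a minimal sketch with a hypothetical path:
create external table if not exists table_name_ext (
    c1 string comment 'c1'
) comment 'table_name_ext' partitioned by (month_id string, day_id string)
row format delimited fields terminated by '|'
stored as textfile
location '/data/table_name_ext'; -- hypothetical path; dropping this table leaves the files in place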
-- 7. Drop a partition
-- Note the difference: for an external table, dropping a partition removes only the metadata and keeps the directory and data; for a managed (internal) table it removes the metadata as well as the corresponding directory and data
alter table table_name drop partition(month_id='',day_id='');
-- List partitions
show partitions table_name;
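-- each partition is listed as a path-like pair, e.g. month_id=202309/day_id=22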
-- When to use dynamic vs. static partitions
-- For data volumes that are not large, dynamic partitioning is generally suitable (see the dynamic insert sketch after step 8)
-- Note: the following settings only take effect in the current session
-- 1. Enable dynamic partition mode:
set hive.exec.dynamic.partition=true;
-- 2. Set the partition mode to non-strict
set hive.exec.dynamic.partition.mode=nonstrict;
-- 8. insert overwrite vs. insert into
-- Overwrite insert (replaces all existing data under the 202309/22 directory)
insert overwrite table table_name partition(month_id='202309',day_id='22')
select * from source_table_name;
-- Append insert (appends to the static partition with parent directory 202309 and child directory 22)
insert into table table_name partition(month_id='202309',day_id='22')
select * from source_table_name;
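-- With the settings above enabled, partition values can also be taken from the query itself (dynamic partitioning);
-- a minimal sketch, assuming source_table_name carries month_id and day_id as its last two columns:
insert overwrite table table_name partition(month_id, day_id)
select c1, c2, c3, month_id, day_id from source_table_name;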
2.2 Common functions
-- 1. Date functions
select current_date;
select current_timestamp;
select date_sub(current_date,1);
select date_add(current_date,1);
-- Date difference: the Hive function is datediff(enddate, startdate) and returns the day count as an int
select datediff(current_date, current_date);
-- 2. String functions
select substr(current_date,1,7);
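-- substr implicitly casts the date to a string, so the call above returns the 'yyyy-MM' prefix, e.g. '2023-09'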
2.3 File-writing example
// line terminator (newline)
String lineTerminated = "\r\n";
// field delimiter
String fieldsTerminated = "\001";
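// note: \001 (Ctrl-A) is Hive's default field delimiter, so a table created without a ROW FORMAT clause can read this output directly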
/**
 * Fetch data from the given URL.
 * @param url request URL
 * @return the fetched data as a byte stream, or null if the API returned nothing
 */
InputStream getDataFromApi(String url) {
    log.info("url-{}, token-{}", url, TOKEN);
    JSONObject r = HttpClientUtil.httpPost(url, TOKEN);
    if (null == r) {
        log.info("API call returned no data");
        return null;
    }
    BaseResltPOJO p = r.toJavaObject(BaseResltPOJO.class);
    log.info("API result set: {}", p);
    int i = 0;
    StringBuilder sb = new StringBuilder();
    String loadTime = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date());
    // one output line per record: code, name and load time, joined by the Hive field delimiter
    for (BasePOJO pd : p.getData()) {
        if (!pd.getCode().isEmpty()) {
            i++;
            sb.append(pd.getCode())
              .append(this.fieldsTerminated)
              .append(pd.getName())
              .append(this.fieldsTerminated)
              .append(loadTime)
              .append(this.lineTerminated);
        }
    }
    log.info("Processed {} records", i);
    return new ByteArrayInputStream(sb.toString().getBytes());
}
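A Hive table matching the delimiters written above could be defined roughly as follows; the table name, column names, and HDFS path are illustrative assumptions, not part of the original code:
create table if not exists api_data (
    code string,
    name string,
    load_time string
) row format delimited fields terminated by '\001'
stored as textfile;
-- assumed path: load the file once getDataFromApi's output has been written to HDFS
load data inpath '/tmp/api_data.txt' into table api_data;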
References
A full review of seven big-data query engines: Hive, SparkSQL, Presto, and others
https://github.com/spring-projects/spring-hadoop-samples