Hive
Hive is a data warehouse tool built on top of Hadoop for data extraction, transformation, and loading (ETL). It provides a mechanism to store, query, and analyze large-scale data stored in Hadoop.
Hive exposes a set of interactive interfaces through which it receives user commands (SQL). Its Driver, together with the metadata in the MetaStore, translates those commands into MapReduce jobs, submits them to Hadoop for execution, and finally returns the results to the user interface.
1. Examples
1.1 Using AS
Column aliases: Hive differs from MySQL here.
MySQL: single quotes ('') around the alias
Hive: backticks (``), as shown in the query below
SELECT
week_name AS `经营周`,
start_time_string AS `开始时间`,
end_time_string AS `结束时间`,
device_code AS `设备名称`,
project_name AS `楼盘名称`,
city_name AS `城市名称`,
area_name AS `城市区域`,
address AS `地址`,
surplus_rate AS `余量上刊率`,
cash_rate AS `现金上刊率`,
substitution_rate AS `置换上刊率`,
give_rate AS `赠播上刊率`,
comprehensive_rate AS `综合上刊率`
FROM
(
SELECT
*,
row_number() over (ORDER BY week_id, device_code) rn
FROM
dm2.dm2_pv_result_point_rate
WHERE
week_id >= 2019011
AND week_id <= 2020122 AND cus_type = 0 AND device_type = 1 AND cash_rate >= 0
AND cash_rate < 9999.0
) bb
2. Hive SQL
2.1 Basic operations
-- 1. Drop tables
drop table if exists passwords;
drop table if exists grpshell;
-- 2. Create a table
create table if not exists passwords (
    username string, passwd string, uid int, gid int,
    userinfo string, home string, shell string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ':' LINES TERMINATED BY '10'; -- '10' is the ASCII code of the newline character
-- 3. Load data (associate the file with the table)
load data inpath '${hiveconf:inputFile}' into table passwords;
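-- the column list and ':' delimiter match the /etc/passwd layout, so an input line looks like:
-- root:x:0:0:root:/root:/bin/bash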
-- 4. Copy data from an existing table into a new table
create table if not exists grpshell (shell string, count int);
INSERT OVERWRITE TABLE grpshell SELECT p.shell, count(*) FROM passwords p GROUP BY p.shell;
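-- a usage sketch to inspect the aggregated result (backticks avoid ambiguity with the count() function):
SELECT shell, `count` FROM grpshell ORDER BY `count` DESC;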
-- 5. Create a partitioned table (depends on the concrete business scenario)
-- partitioned by creates parent and child directories; distinguish dynamic vs. static partitions, and managed (internal) vs. external tables
create table if not exists table_name (
c1 string comment 'c1',
c2 string comment 'c2',
c3 string comment 'c3'
) comment 'table_name' partitioned by (month_id string,day_id string)
-- a field delimiter can also be defined: row format delimited fields terminated by '|'
stored as textfile; -- or parquet / orc
-- 6. Create an external table (external keyword)
-- partitioned by creates parent and child directories; distinguish dynamic vs. static partitions, and managed (internal) vs. external tables
create external table if not exists table_name (
c1 string comment 'c1',
c2 string comment 'c2',
c3 string comment 'c3'
) comment 'table_name' partitioned by (month_id string,day_id string)
-- a field delimiter can also be defined: row format delimited fields terminated by '|'
stored as textfile; -- or parquet / orc
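-- an external table usually points at an existing HDFS directory via location; a minimal sketch with a hypothetical path:
create external table if not exists table_name_ext (
    c1 string comment 'c1'
) comment 'table_name_ext' partitioned by (month_id string, day_id string)
row format delimited fields terminated by '|'
stored as textfile
location '/data/table_name_ext'; -- hypothetical path; dropping this table leaves the files in place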
-- 7. Drop a partition
-- Note the difference: for an external table, dropping a partition removes only the metadata and keeps the directory and data; for a managed (internal) table it removes the metadata as well as the corresponding directory and data
alter table table_name drop partition(month_id='',day_id='');
-- List partitions
show partitions table_name;
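-- each partition is listed as a path-like pair, e.g. month_id=202309/day_id=22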
-- When to use dynamic vs. static partitions
-- For data volumes that are not large, dynamic partitioning is generally suitable (see the dynamic insert sketch after step 8)
-- Note: the following settings only take effect in the current session
-- 1. Enable dynamic partition mode:
set hive.exec.dynamic.partition=true;
-- 2. Set the partition mode to non-strict
set hive.exec.dynamic.partition.mode=nonstrict;
-- 8. insert overwrite vs. insert into
-- Overwrite insert (replaces all existing data under the 202309/22 directory)
insert overwrite table table_name partition(month_id='202309',day_id='22')
select * from source_table_name;
-- Append insert (appends to the static partition with parent directory 202309 and child directory 22)
insert into table table_name partition(month_id='202309',day_id='22')
select * from source_table_name;
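-- With the settings above enabled, partition values can also be taken from the query itself (dynamic partitioning);
-- a minimal sketch, assuming source_table_name carries month_id and day_id as its last two columns:
insert overwrite table table_name partition(month_id, day_id)
select c1, c2, c3, month_id, day_id from source_table_name;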
2.2 Common functions
-- 1. Date functions
select current_date;
select current_timestamp;
select date_sub(current_date,1);
select date_add(current_date,1);
-- Date difference: the Hive function is datediff(enddate, startdate) and returns the day count as an int
select datediff(current_date, current_date);
-- 2. String functions
select substr(current_date,1,7);
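-- substr implicitly casts the date to a string, so the call above returns the 'yyyy-MM' prefix, e.g. '2023-09'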
2.3 File-writing example
// line terminator (newline)
String lineTerminated = "\r\n";
// field delimiter
String fieldsTerminated = "\001";
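// note: \001 (Ctrl-A) is Hive's default field delimiter, so a table created without a ROW FORMAT clause can read this output directly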
/**
 * Fetch data from the given URL.
 * @param url request URL
 * @return the fetched data as a byte stream, or null if the API returned nothing
 */
InputStream getDataFromApi(String url) {
    log.info("url-{}, token-{}", url, TOKEN);
    JSONObject r = HttpClientUtil.httpPost(url, TOKEN);
    if (null == r) {
        log.info("API call returned no data");
        return null;
    }
    BaseResltPOJO p = r.toJavaObject(BaseResltPOJO.class);
    log.info("API result set: {}", p);
    int i = 0;
    StringBuilder sb = new StringBuilder();
    String loadTime = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date());
    // one output line per record: code, name and load time, joined by the Hive field delimiter
    for (BasePOJO pd : p.getData()) {
        if (!pd.getCode().isEmpty()) {
            i++;
            sb.append(pd.getCode())
              .append(this.fieldsTerminated)
              .append(pd.getName())
              .append(this.fieldsTerminated)
              .append(loadTime)
              .append(this.lineTerminated);
        }
    }
    log.info("Processed {} records", i);
    return new ByteArrayInputStream(sb.toString().getBytes());
}
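A Hive table matching the delimiters written above could be defined roughly as follows; the table name, column names, and HDFS path are illustrative assumptions, not part of the original code:
create table if not exists api_data (
    code string,
    name string,
    load_time string
) row format delimited fields terminated by '\001'
stored as textfile;
-- assumed path: load the file once getDataFromApi's output has been written to HDFS
load data inpath '/tmp/api_data.txt' into table api_data;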
References
A full review of seven big-data query engines: Hive, SparkSQL, Presto, and others
https://github.com/spring-projects/spring-hadoop-samples