概述

本文介绍使用 Hive 构建离线数仓,统计活跃会员(日活/周活/月活)的完整流程,从 DWD 明细层到 DWS 汇总层再到 ADS 应用层。

核心概念

活跃会员:打开应用的会员即为活跃会员

活跃会员指标需求:每日、每周、每月的活跃会员数

数据流向

  • DWD:会员的每日启动信息明细
  • DWS:每日/每周/每月活跃会员信息
  • ADS:每日、每周、每月活跃会员数(输出)

DWS 层建表脚本

use dws;
drop table if exists dws.dws_member_start_day;
create table dws.dws_member_start_day
(
  `device_id` string,
  `uid` string,
  `app_v` string,
  `os_type` string,
  `language` string,
  `channel` string,
  `area` string,
  `brand` string
) COMMENT '会员日启动汇总'
partitioned by(dt string)
stored as parquet;

drop table if exists dws.dws_member_start_week;
create table dws.dws_member_start_week(
  `device_id` string,
  `uid` string,
  `app_v` string,
  `os_type` string,
  `language` string,
  `channel` string,
  `area` string,
  `brand` string,
  `week` string
) COMMENT '会员周启动汇总'
PARTITIONED BY (`dt` string)
stored as parquet;

drop table if exists dws.dws_member_start_month;
create table dws.dws_member_start_month(
  `device_id` string,
  `uid` string,
  `app_v` string,
  `os_type` string,
  `language` string,
  `channel` string,
  `area` string,
  `brand` string,
  `month` string
) COMMENT '会员月启动汇总'
PARTITIONED BY (`dt` string)
stored as parquet;

DWS 层数据加载脚本

#!/bin/bash
source /etc/profile
if [ -n "$1" ]
then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi

sql="
insert overwrite table dws.dws_member_start_day
partition(dt='$do_date')
select device_id,
concat_ws('|', collect_set(uid)),
concat_ws('|', collect_set(app_v)),
concat_ws('|', collect_set(os_type)),
concat_ws('|', collect_set(language)),
concat_ws('|', collect_set(channel)),
concat_ws('|', collect_set(area)),
concat_ws('|', collect_set(brand))
from dwd.dwd_start_log
where dt='$do_date'
group by device_id;

-- 汇总得到每周活跃会员
insert overwrite table dws.dws_member_start_week
partition(dt='$do_date')
select device_id,
concat_ws('|', collect_set(uid)),
concat_ws('|', collect_set(app_v)),
concat_ws('|', collect_set(os_type)),
concat_ws('|', collect_set(language)),
concat_ws('|', collect_set(channel)),
concat_ws('|', collect_set(area)),
concat_ws('|', collect_set(brand)),
date_add(next_day('$do_date', 'mo'), -7)
from dws.dws_member_start_day
where dt >= date_add(next_day('$do_date', 'mo'), -7)
and dt <= '$do_date'
group by device_id;

-- 汇总得到每月活跃会员
insert overwrite table dws.dws_member_start_month
partition(dt='$do_date')
select device_id,
concat_ws('|', collect_set(uid)),
concat_ws('|', collect_set(app_v)),
concat_ws('|', collect_set(os_type)),
concat_ws('|', collect_set(language)),
concat_ws('|', collect_set(channel)),
concat_ws('|', collect_set(area)),
concat_ws('|', collect_set(brand)),
date_format('$do_date', 'yyyy-MM')
from dws.dws_member_start_day
where dt >= date_format('$do_date', 'yyyy-MM-01')
and dt <= '$do_date'
group by device_id;
"
hive -e "$sql"

ADS 层建表脚本

drop table if exists ads.ads_member_active_count;
create table ads.ads_member_active_count(
  `day_count` int COMMENT '当日会员数量',
  `week_count` int COMMENT '当周会员数量',
  `month_count` int COMMENT '当月会员数量'
) COMMENT '活跃会员数'
partitioned by(dt string)
row format delimited fields terminated by ',';

ADS 层数据加载脚本(方式一:WITH + UNION ALL)

#!/bin/bash
source /etc/profile
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi

sql="
with tmp as(
  select 'day' datelabel, count(*) cnt, dt
  from dws.dws_member_start_day
  where dt='$do_date'
  group by dt
  union all
  select 'week' datelabel, count(*) cnt, dt
  from dws.dws_member_start_week
  where dt='$do_date'
  group by dt
  union all
  select 'month' datelabel, count(*) cnt, dt
  from dws.dws_member_start_month
  where dt='$do_date'
  group by dt
)
insert overwrite table ads.ads_member_active_count
partition(dt='$do_date')
select sum(case when datelabel='day' then cnt end) as day_count,
sum(case when datelabel='week' then cnt end) as week_count,
sum(case when datelabel='month' then cnt end) as month_count
from tmp
group by dt;
"
hive -e "$sql"

ADS 层数据加载脚本(方式二:JOIN)

#!/bin/bash
source /etc/profile
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi

sql="
insert overwrite table ads.ads_member_active_count
partition(dt='$do_date')
select daycnt, weekcnt, monthcnt
from (select dt, count(*) daycnt
      from dws.dws_member_start_day
      where dt='$do_date'
      group by dt
     ) day join
(select dt, count(*) weekcnt
 from dws.dws_member_start_week
 where dt='$do_date'
 group by dt
) week on day.dt=week.dt
join
(select dt, count(*) monthcnt
 from dws.dws_member_start_month
 where dt='$do_date'
 group by dt
) month on day.dt=month.dt;
"
hive -e "$sql"

两种 ADS 脚本对比

  • 方式一(WITH + UNION ALL):通过构建临时表将不同维度的数据汇总,再通过 SUM 计算最终结果,方式灵活,便于扩展
  • 方式二(JOIN):直接通过 JOIN 将天、周、月三个维度的数据联结在一起,性能可能更高效,但扩展性稍差

常见错误速查

症状根因定位修复
Shell 脚本执行报 “bad interpreter”shebang 写成 #!/bin/bash(全角感叹号)或脚本含 Windows 换行改为 #!/bin/bashsed -i 's/\r$//' script.sh
DWS 日表里 uid/app_v 等变成 abc 长串用 collect_set 拼接字符串是”存展示字段”避免直接使用 collect_set 拼接长字符串
周活跃口径不稳定周表 partition(dt=跑批日),但 week 范围用”周一到 do_date”的滚动窗口周汇总建议按 week 分区/主键输出
next_day 在部分环境报错weekday 参数依赖实现/语言环境统一用明确写法如 next_day(date_sub('$do_date',7),'MO')
ADS 产出为 NULL 或 0dt 分区值不一致、跑批日无数据确保 ODS/DWD/DWS 同一 do_date
Hive 执行报解析错误多语句拼接导致截断每句以 ; 结束并换行