大数据开发工程师-第十三周 综合项目:电商数据仓库之商品订单数仓


第十三周 综合项目:电商数据仓库之商品订单数仓

1
2
3
和之前用户行为数仓构建步骤一样,先对下面两层进行构建,上面两层基于业务需求来构建这两层的表

服务端数据在mysql中的表如下:(已通过sqoop抽取到hdfs上,只需要创建表将数据关联起来就可以了)

image-20230405224934100

1
hdfs上商品订单数据相关目录

image-20230405230954102

image-20230405231024155

image-20230405231205937

image-20230405231347893

ods层

image-20230405225123910

建表脚本

ods_mall_init_table.sh

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/bin/bash
# ODS-layer database/table initialization script — run ONCE.
# Creates the ods_mall database and one external table per source MySQL table
# that Sqoop has already exported to HDFS. Every table is:
#   - EXTERNAL: dropping the table never deletes the HDFS data;
#   - partitioned by dt (yyyyMMdd ingestion date, added daily by
#     ods_mall_add_partition.sh);
#   - tab-delimited text, located under /data/ods/<table>/.
# All statements use IF NOT EXISTS, so re-running the script is harmless.
# NOTE(review): 'second_catery_name' is a misspelling of 'second_category_name'
# that is baked into the source schema — kept as-is for compatibility.

hive -e "
create database if not exists ods_mall;

create external table if not exists ods_mall.ods_user(
user_id bigint,
user_name string,
user_gender tinyint,
user_birthday string,
e_mail string,
mobile string,
register_time string,
is_blacklist tinyint
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/ods/user/';

create external table if not exists ods_mall.ods_user_extend(
user_id bigint,
is_pregnant_woman tinyint,
is_have_children tinyint,
is_have_car tinyint,
phone_brand string,
phone_cnt int,
change_phone_cnt int,
weight int,
height int
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/ods/user_extend/';

create external table if not exists ods_mall.ods_user_addr(
addr_id bigint,
user_id bigint,
addr_name string,
order_flag tinyint,
user_name string,
mobile string
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/ods/user_addr/';

create external table if not exists ods_mall.ods_goods_info(
goods_id bigint,
goods_no string,
goods_name string,
curr_price double,
third_category_id int,
goods_desc string,
create_time string
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/ods/goods_info/';

create external table if not exists ods_mall.ods_category_code(
first_category_id int,
first_category_name string,
second_category_id int,
second_catery_name string,
third_category_id int,
third_category_name string
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/ods/category_code/';

create external table if not exists ods_mall.ods_user_order(
order_id bigint,
order_date string,
user_id bigint,
order_money double,
order_type int,
order_status int,
pay_id bigint,
update_time string
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/ods/user_order/';

create external table if not exists ods_mall.ods_order_item(
order_id bigint,
goods_id bigint,
goods_amount int,
curr_price double,
create_time string
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/ods/order_item/';

create external table if not exists ods_mall.ods_order_delivery(
order_id bigint,
addr_id bigint,
user_id bigint,
carriage_money double,
create_time string
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/ods/order_delivery/';

create external table if not exists ods_mall.ods_payment_flow(
pay_id bigint,
order_id bigint,
trade_no bigint,
pay_money double,
pay_type int,
pay_time string
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/ods/payment_flow/';

"

ods_mall_add_partition.sh

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/bin/bash
# Registers the daily dt partition on every ods_mall table.
# Scheduled once per day, early morning: it adds yesterday's partition, after
# which the downstream dwd/dws/app computation scripts can run.
#
# Usage: ods_mall_add_partition.sh [yyyyMMdd]
#   Without an argument the partition date defaults to yesterday.
# (The previously committed block of commented-out ALTER TABLE statements was
# removed; the generic helper below performs the same partition registration.)

# Default to yesterday's date; a specific date may be passed as $1.
if [ -z "$1" ]
then
dt=$(date +%Y%m%d --date="1 days ago")
else
dt=$1
fi

# Reuse the generic partition helper from the user-behavior warehouse:
#   add_partition.sh <db.table> <partition value> <hdfs subdirectory>
sh add_partition.sh ods_mall.ods_user ${dt} ${dt}
sh add_partition.sh ods_mall.ods_user_extend ${dt} ${dt}
sh add_partition.sh ods_mall.ods_user_addr ${dt} ${dt}
sh add_partition.sh ods_mall.ods_goods_info ${dt} ${dt}
sh add_partition.sh ods_mall.ods_category_code ${dt} ${dt}
sh add_partition.sh ods_mall.ods_user_order ${dt} ${dt}
sh add_partition.sh ods_mall.ods_order_item ${dt} ${dt}
sh add_partition.sh ods_mall.ods_order_delivery ${dt} ${dt}
sh add_partition.sh ods_mall.ods_payment_flow ${dt} ${dt}

dwd层

1
其实可以直接对ods中对应的表进行抽取字段,但为了保险起见,对各个表中的id字段(是各个表的主键)进行过滤,因为数据采集过程中可能id字段会出现问题

建表脚本

dwd_mall_init_table.sh

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/bin/bash
# DWD-layer database/table initialization script — run ONCE.
# Creates dwd_mall and one external table per ODS table; the schemas mirror
# ods_mall exactly, because the DWD load (dwd_mall_add_partition.sh) only
# filters out rows whose primary-key id is NULL.
# Tables are EXTERNAL, dt-partitioned, tab-delimited, under /data/dwd/<table>/.
# All statements use IF NOT EXISTS, so re-running the script is harmless.
# NOTE(review): 'second_catery_name' misspelling is inherited from the source
# schema and kept for consistency with the ODS layer.

hive -e "
create database if not exists dwd_mall;

create external table if not exists dwd_mall.dwd_user(
user_id bigint,
user_name string,
user_gender tinyint,
user_birthday string,
e_mail string,
mobile string,
register_time string,
is_blacklist tinyint
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/dwd/user/';



create external table if not exists dwd_mall.dwd_user_extend(
user_id bigint,
is_pregnant_woman tinyint,
is_have_children tinyint,
is_have_car tinyint,
phone_brand string,
phone_cnt int,
change_phone_cnt int,
weight int,
height int
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/dwd/user_extend/';



create external table if not exists dwd_mall.dwd_user_addr(
addr_id bigint,
user_id bigint,
addr_name string,
order_flag tinyint,
user_name string,
mobile string
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/dwd/user_addr/';



create external table if not exists dwd_mall.dwd_goods_info(
goods_id bigint,
goods_no string,
goods_name string,
curr_price double,
third_category_id int,
goods_desc string,
create_time string
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/dwd/goods_info/';



create external table if not exists dwd_mall.dwd_category_code(
first_category_id int,
first_category_name string,
second_category_id int,
second_catery_name string,
third_category_id int,
third_category_name string
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/dwd/category_code/';


create external table if not exists dwd_mall.dwd_user_order(
order_id bigint,
order_date string,
user_id bigint,
order_money double,
order_type int,
order_status int,
pay_id bigint,
update_time string
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/dwd/user_order/';


create external table if not exists dwd_mall.dwd_order_item(
order_id bigint,
goods_id bigint,
goods_amount int,
curr_price double,
create_time string
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/dwd/order_item/';


create external table if not exists dwd_mall.dwd_order_delivery(
order_id bigint,
addr_id bigint,
user_id bigint,
carriage_money double,
create_time string
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/dwd/order_delivery/';



create external table if not exists dwd_mall.dwd_payment_flow(
pay_id bigint,
order_id bigint,
trade_no bigint,
pay_money double,
pay_type int,
pay_time string
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/dwd/payment_flow/';

"

dwd_mall_add_partition.sh

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/bin/bash
# Cleans each ODS table's daily partition into the matching DWD table.
# Scheduled once per day, early morning, after ods_mall_add_partition.sh.
# Cleaning rule (see project notes): keep only rows whose primary-key id is
# not NULL, since the ingestion pipeline may occasionally corrupt id fields.
#
# Usage: dwd_mall_add_partition.sh [yyyyMMdd]   (defaults to yesterday)
#
# Fixes applied in review:
#   - "insert overwrite t able" (order_delivery statement) was a typo that
#     aborted the entire hive -e invocation; corrected to "table".
#   - payment_flow was filtered on order_id; its primary key is pay_id, so the
#     filter now matches the stated per-table primary-key rule.

# Default to yesterday's date; a specific date may be passed as $1.
if [ "z$1" = "z" ]
then
dt=`date +%Y%m%d --date="1 days ago"`
else
dt=$1
fi

hive -e "

insert overwrite table dwd_mall.dwd_user partition(dt='${dt}') select
user_id,
user_name,
user_gender,
user_birthday,
e_mail,
mobile,
register_time,
is_blacklist
from ods_mall.ods_user
where dt = '${dt}' and user_id is not null;


insert overwrite table dwd_mall.dwd_user_extend partition(dt='${dt}') select
user_id,
is_pregnant_woman,
is_have_children,
is_have_car,
phone_brand,
phone_cnt,
change_phone_cnt,
weight,
height
from ods_mall.ods_user_extend
where dt = '${dt}' and user_id is not null;


insert overwrite table dwd_mall.dwd_user_addr partition(dt='${dt}') select
addr_id,
user_id,
addr_name,
order_flag,
user_name,
mobile
from ods_mall.ods_user_addr
where dt = '${dt}' and addr_id is not null;



insert overwrite table dwd_mall.dwd_goods_info partition(dt='${dt}') select
goods_id,
goods_no,
goods_name,
curr_price,
third_category_id,
goods_desc,
create_time
from ods_mall.ods_goods_info
where dt = '${dt}' and goods_id is not null;


insert overwrite table dwd_mall.dwd_category_code partition(dt='${dt}') select
first_category_id,
first_category_name,
second_category_id,
second_catery_name,
third_category_id,
third_category_name
from ods_mall.ods_category_code
where dt = '${dt}' and first_category_id is not null;


insert overwrite table dwd_mall.dwd_user_order partition(dt='${dt}') select
order_id,
order_date,
user_id,
order_money,
order_type,
order_status,
pay_id,
update_time
from ods_mall.ods_user_order
where dt = '${dt}' and order_id is not null;


insert overwrite table dwd_mall.dwd_order_item partition(dt='${dt}') select
order_id,
goods_id,
goods_amount,
curr_price,
create_time
from ods_mall.ods_order_item
where dt = '${dt}' and order_id is not null;


insert overwrite table dwd_mall.dwd_order_delivery partition(dt='${dt}') select
order_id,
addr_id,
user_id,
carriage_money,
create_time
from ods_mall.ods_order_delivery
where dt = '${dt}' and order_id is not null;


insert overwrite table dwd_mall.dwd_payment_flow partition(dt='${dt}') select
pay_id,
order_id,
trade_no,
pay_money,
pay_type,
pay_time
from ods_mall.ods_payment_flow
where dt = '${dt}' and pay_id is not null;
"

image-20230405233300223

image-20230405233549286

image-20230405233613163

image-20230405233743838

需求分析

1
2
3
4
5
6
7
需求一:用户信息宽表(后期查询用户信息时只需在一张表里查询,不需要关联多张表)
需求二:电商GMV(电商领域很常见,表示一段时间内的交易总金额)
需求三:商品相关指标(商品销售情况,比如哪些受用户喜欢)
需求四:用户行为漏斗分析

为了保证大家在练习的时候计算的需求结果和我的保持一致,所以针对后面的测试数据就不再随机生成了,而是生成固定的数据,一共1个月的数据
从2026-02-01到2026-02-28的数据

image-20230405234430452

image-20230405234656212

image-20230405234927830

1
2
3
4
执行上面的代码,此时数据就会被上传到HDFS上面。
然后执行ods层和dwd层的脚本,重新加载计算2026-02月份的数据。
1:执行ods层的脚本
写一个临时脚本,在脚本中写一个for循环,循环加载数据
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#!/bin/bash
# One-off backfill helper: registers the ODS partitions for every day of
# February 2026 (20260201 .. 20260228) by invoking the daily script per date.

for day in $(seq 1 28)
do
    # Zero-pad the day-of-month to build a yyyyMMdd partition value.
    dt=$(printf "202602%02d" "$day")
    echo "ods_mall_add_partition.sh" ${dt}
    sh ods_mall_add_partition.sh ${dt}
done
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#!/bin/bash
# One-off backfill helper: computes the DWD layer for every day of
# February 2026 (20260201 .. 20260228) by invoking the daily script per date.

for day in $(seq 1 28)
do
    # Zero-pad the day-of-month to build a yyyyMMdd partition value.
    dt=$(printf "202602%02d" "$day")
    echo "dwd_mall_add_partition.sh" ${dt}
    sh dwd_mall_add_partition.sh ${dt}
done
1
等脚本执行完毕之后,验证一下结果,随便找一个日期的数据验证即可,能查到数据就说明是OK的。最好是再查看一下 partition的信息。

image-20230405235827525

image-20230405235349443

image-20230405235406590

image-20230405235510464

image-20230405235640110

需求一:用户信息宽表

1
2
3
4
5
宽表主要是便于使用,在使用的时候不至于每次都要关联很多张表
用户信息宽表包括服务端中的user表、user_extend表
如果有需求的话其实还可以把用户的一些其它维度的数据关联过来,例如:当日的下单数量、消费金额等等指标,
实现思路如下:
对dwd_user表和dwd_user_extend表执行left join操作,通过user_id进行关联即可,将结果数据保存到 dws_user_info_all表中

dws层

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
-- dws layer: user-info wide table (requirement 1).
-- One-time DDL plus a sample load for partition 20260201.
create external table if not exists dws_mall.dws_user_info_all(
user_id bigint,
user_name string,
user_gender tinyint,
user_birthday string,
e_mail string,
mobile string,
register_time string,
is_blacklist tinyint,
is_pregnant_woman tinyint,
is_have_children tinyint,
is_have_car tinyint,
phone_brand string,
phone_cnt int,
change_phone_cnt int,
weight int,
height int
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/dws/user_info_all/';


-- Bug fix: the partition filter for the right-hand table of a LEFT JOIN must
-- live in the ON clause. With "due.dt = ..." in WHERE, NULL-extended rows
-- were discarded, silently turning the LEFT JOIN into an INNER JOIN and
-- dropping every user that has no dwd_user_extend record.
insert overwrite table dws_mall.dws_user_info_all partition(dt='20260201') select
du.user_id,
du.user_name,
du.user_gender,
du.user_birthday,
du.e_mail,
du.mobile,
du.register_time,
du.is_blacklist,
due.is_pregnant_woman,
due.is_have_children,
due.is_have_car,
due.phone_brand,
due.phone_cnt,
due.change_phone_cnt,
due.weight,
due.height
from dwd_mall.dwd_user as du
left join dwd_mall.dwd_user_extend as due
on du.user_id = due.user_id and due.dt = '20260201'
where du.dt = '20260201';
建表脚本
1
2
3
4
5
针对 dws 层抽取脚本
1:表初始化脚本(初始化执行一次)
dws_mall_init_table_1.sh
2:添加分区数据脚本(每天执行一次)
dws_mall_add_partition_1.sh
dws_mall_init_table_1.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#!/bin/bash
# Requirement 1: user-info wide table.
# DWS-layer database/table initialization script — run ONCE.
# dws_user_info_all denormalizes dwd_user + dwd_user_extend (joined on
# user_id) so downstream queries need a single table instead of a join.
# External, dt-partitioned, tab-delimited, under /data/dws/user_info_all/.

hive -e "
create database if not exists dws_mall;

create external table if not exists dws_mall.dws_user_info_all(
user_id bigint,
user_name string,
user_gender tinyint,
user_birthday string,
e_mail string,
mobile string,
register_time string,
is_blacklist tinyint,
is_pregnant_woman tinyint,
is_have_children tinyint,
is_have_car tinyint,
phone_brand string,
phone_cnt int,
change_phone_cnt int,
weight int,
height int
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/dws/user_info_all/';
"
dws_mall_add_partition_1.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/bin/bash
# Requirement 1: user-info wide table — daily DWS load.
# Scheduled once per day in the early morning.
#
# Usage: dws_mall_add_partition_1.sh [yyyyMMdd]   (defaults to yesterday)
#
# Bug fix applied in review: the partition filter for the right-hand table of
# the LEFT JOIN must sit in the ON clause. With "due.dt = ..." in WHERE,
# NULL-extended rows were discarded, silently turning the LEFT JOIN into an
# INNER JOIN and dropping users that have no dwd_user_extend record.

# Default to yesterday's date; a specific date may be passed as $1.
if [ "z$1" = "z" ]
then
dt=`date +%Y%m%d --date="1 days ago"`
else
dt=$1
fi

hive -e "
insert overwrite table dws_mall.dws_user_info_all partition(dt='${dt}') select
du.user_id,
du.user_name,
du.user_gender,
du.user_birthday,
du.e_mail,
du.mobile,
du.register_time,
du.is_blacklist,
due.is_pregnant_woman,
due.is_have_children,
due.is_have_car,
due.phone_brand,
due.phone_cnt,
due.change_phone_cnt,
due.weight,
due.height
from dwd_mall.dwd_user as du
left join dwd_mall.dwd_user_extend as due
on du.user_id = due.user_id and due.dt = '${dt}'
where du.dt = '${dt}';
"

image-20230406001420794

image-20230406001456982

image-20230406001541136

1
2
3
4
5
问题:如何将服务端的数据和客户端的数据通过用户维度关联起来?
之前统计的客户端数据(用户行为数据)针对用户相关的指标都是使用的xaid(因为用户在使用app时,不一定登录了,所以没有用user_id)。
但是在服务端数据中用户信息只有user_id。
这两份数据如果想要关联起来,还需要在用户行为数仓中提取一个表,表里面只需要有两列:
user_id和xaid。这样就可以把客户端数据和服务端数据中的用户关联起来了。

需求二:电商GMV

1
2
3
4
5
6
GMV:Gross Merchandise Volume,是指一定时间段内的成交总金额
GMV多用于电商行业,这个实际指的是拍下的订单总金额,包含付款和未付款的部分。
我们在统计的时候就可以将订单表中的每天的所有订单金额全部都累加起来就可以获取到当天的GMV了

实现思路:
对dwd_user_order表中的数据进行统计即可,通过order_money字段可以计算出来GMV,将结果数据保存到表 app_gmv中

app层

1
2
3
注意:gmv字段的类型可以使用double或者decimal(10,2)都可以
decimal(10,2),可以更方便的控制小数位数,数据看起来更加清晰,所以建议使用decimal(10,2)。
其实针对金钱相关的字段类型建议使用decimal(10,2),使用double也不错,也是可以的。
1
2
3
4
5
6
7
8
9
10
11
12
13
-- app layer: daily GMV table (requirement 2).
-- One-time DDL plus a sample load for partition 20260201.
CREATE EXTERNAL TABLE IF NOT EXISTS app_mall.app_gmv(
    gmv decimal(10,2)
) PARTITIONED BY (dt string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LOCATION 'hdfs://bigdata01:9000/data/app/gmv/';



-- GMV = total order amount placed that day (paid and unpaid alike),
-- i.e. the sum of order_money over the day's partition.
INSERT OVERWRITE TABLE app_mall.app_gmv PARTITION(dt='20260201')
SELECT
    SUM(order_money) AS gmv
FROM dwd_mall.dwd_user_order
WHERE dt = '20260201';
建表脚本
1
2
3
4
5
针对 app 层抽取脚本
1:表初始化脚本(初始化执行一次)
app_mall_init_table_2.sh
2:添加分区数据脚本(每天执行一次)
app_mall_add_partition_2.sh
app_mall_init_table_2.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#!/bin/bash
# Requirement 2: e-commerce GMV.
# APP-layer database/table initialization script — run ONCE.
# app_gmv holds one row per dt partition: the day's gross merchandise volume.
# decimal(10,2) is used (rather than double) to keep money values at a fixed
# two decimal places.

hive -e "
create database if not exists app_mall;

create external table if not exists app_mall.app_gmv(
gmv decimal(10,2)
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/app/gmv/';
"
app_mall_add_partition_2.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#!/bin/bash
# Requirement 2: e-commerce GMV — daily APP-layer load.
# Scheduled once per day in the early morning.
#
# Usage: app_mall_add_partition_2.sh [yyyyMMdd]
#   Without an argument the target date defaults to yesterday.

# Fall back to yesterday's date when no argument is supplied.
dt=${1:-$(date +%Y%m%d --date="1 days ago")}


hive -e "
insert overwrite table app_mall.app_gmv partition(dt='${dt}') select
sum(order_money) as gmv
from dwd_mall.dwd_user_order
where dt = '${dt}';
"
1
2
3
问题:有了每天的GMV,后期能不能计算月度GMV,季度GMV,以及年度GMV?以及GMV的同比,环比?
可以的,有了每日的GMV之后,按照月、季度、年对GMV进行聚合即可。
同比环比,我们在前面已经实现过类似的需求。

image-20230406003729623

image-20230406003754317

image-20230406003824047

需求三:商品相关指标

1
2
3
4
5
6
7
8
9
10
11
12
13
14
针对这个需求我们主要统计两个指标
1:商品销售情况(商品名称,一级类目,订单总量,销售额)
2:商品品类偏好Top10(商品一级类目,订单总量)

首先看第一个指标:商品销售情况
这个指标主要统计商品名称,一级类目,订单总量,销售额这些字段信息
订单中的详细信息是在dwd_order_item表中,需要关联dwd_goods_info和
dwd_category_code获取商品名称和商品一级类目信息

在这最好是基于这些表先构建一个商品订单信息的宽表dws_order_goods_all_info,便于后期其它需求复用。
然后基于这个宽表统计出来这个指标需要的信息,保存到表app_goods_sales_item

接着看第二个指标:商品品类偏好Top10
这个指标可以在第一个指标的基础之上,根据一级类目进行分组,按照类目下的订单总量排序,取Top10,保存到表 app_category_top10中

dws层

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
-- dws layer: order/goods wide table (requirement 3).
-- One-time DDL plus a sample load for partition 20260201.
create external table if not exists dws_mall.dws_order_goods_all_info(
order_id bigint,
goods_id bigint,
goods_amount int,
order_curr_price double,
order_create_time string,
goods_no string,
goods_name string,
goods_curr_price double,
goods_desc string,
goods_create_time string,
first_category_id int,
first_category_name string,
second_category_id int,
second_catery_name string,
third_category_id int,
third_category_name string
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/dws/order_goods_all_info/';

-- Bug fix: the partition filters for the right-hand tables of the LEFT JOINs
-- must sit in the ON clauses. With "dgi.dt = ..." and "dcc.dt = ..." in
-- WHERE, NULL-extended rows were discarded, silently turning both LEFT JOINs
-- into INNER JOINs and dropping order items whose goods or category lookup
-- is missing.
insert overwrite table dws_mall.dws_order_goods_all_info partition(dt='20260201') select
doi.order_id,
doi.goods_id,
doi.goods_amount,
doi.curr_price as order_curr_price,
doi.create_time as order_create_time,
dgi.goods_no,
dgi.goods_name,
dgi.curr_price as goods_curr_price,
dgi.goods_desc,
dgi.create_time as goods_create_time,
dcc.first_category_id,
dcc.first_category_name,
dcc.second_category_id,
dcc.second_catery_name,
dcc.third_category_id,
dcc.third_category_name
from dwd_mall.dwd_order_item as doi
left join dwd_mall.dwd_goods_info as dgi
on doi.goods_id = dgi.goods_id and dgi.dt = '20260201'
left join dwd_mall.dwd_category_code as dcc
on dgi.third_category_id = dcc.third_category_id and dcc.dt = '20260201'
where doi.dt = '20260201';
开发脚本
1
2
3
4
5
针对 dws 层抽取脚本
1:表初始化脚本(初始化执行一次)
dws_mall_init_table_3.sh
2:添加分区数据脚本(每天执行一次)
dws_mall_add_partition_3.sh
dws_mall_init_table_3.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#!/bin/bash
# Requirement 3: goods-related metrics.
# DWS-layer database/table initialization script — run ONCE.
# dws_order_goods_all_info denormalizes dwd_order_item + dwd_goods_info +
# dwd_category_code into one wide table so goods metrics can be computed
# without re-joining; column prefixes (order_/goods_) disambiguate the
# curr_price and create_time fields coming from different source tables.

hive -e "
create database if not exists dws_mall;

create external table if not exists dws_mall.dws_order_goods_all_info(
order_id bigint,
goods_id bigint,
goods_amount int,
order_curr_price double,
order_create_time string,
goods_no string,
goods_name string,
goods_curr_price double,
goods_desc string,
goods_create_time string,
first_category_id int,
first_category_name string,
second_category_id int,
second_catery_name string,
third_category_id int,
third_category_name string
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/dws/order_goods_all_info/';
"
dws_mall_add_partition_3.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/bin/bash
# Requirement 3: goods-related metrics — daily DWS load.
# Scheduled once per day in the early morning.
#
# Usage: dws_mall_add_partition_3.sh [yyyyMMdd]   (defaults to yesterday)
#
# Bug fix applied in review: the partition filters for the right-hand tables
# of the LEFT JOINs must sit in the ON clauses. With "dgi.dt"/"dcc.dt" in
# WHERE, NULL-extended rows were discarded, silently turning both LEFT JOINs
# into INNER JOINs and dropping order items without goods/category matches.

# Default to yesterday's date; a specific date may be passed as $1.
if [ "z$1" = "z" ]
then
dt=`date +%Y%m%d --date="1 days ago"`
else
dt=$1
fi

hive -e "
insert overwrite table dws_mall.dws_order_goods_all_info partition(dt='${dt}') select
doi.order_id,
doi.goods_id,
doi.goods_amount,
doi.curr_price as order_curr_price,
doi.create_time as order_create_time,
dgi.goods_no,
dgi.goods_name,
dgi.curr_price as goods_curr_price,
dgi.goods_desc,
dgi.create_time as goods_create_time,
dcc.first_category_id,
dcc.first_category_name,
dcc.second_category_id,
dcc.second_catery_name,
dcc.third_category_id,
dcc.third_category_name
from dwd_mall.dwd_order_item as doi
left join dwd_mall.dwd_goods_info as dgi
on doi.goods_id = dgi.goods_id and dgi.dt = '${dt}'
left join dwd_mall.dwd_category_code as dcc
on dgi.third_category_id = dcc.third_category_id and dcc.dt = '${dt}'
where doi.dt = '${dt}';
"

app层

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
-- app layer: goods metrics (requirement 3).
-- One-time DDL plus sample loads for partition 20260201.
CREATE EXTERNAL TABLE IF NOT EXISTS app_mall.app_goods_sales_item(
    goods_name string,
    first_category_name string,
    order_total bigint,
    price_total decimal(10,2)
) PARTITIONED BY (dt string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LOCATION 'hdfs://bigdata01:9000/data/app/goods_sales_item/';

-- Per-goods sales: order-line count plus revenue, carried with the
-- first-level category name for downstream rollups.
INSERT OVERWRITE TABLE app_mall.app_goods_sales_item PARTITION(dt='20260201')
SELECT
    goods_name,
    first_category_name,
    COUNT(order_id) AS order_total,
    SUM(goods_amount * order_curr_price) AS price_total
FROM dws_mall.dws_order_goods_all_info
WHERE dt = '20260201'
GROUP BY goods_name, first_category_name;




CREATE EXTERNAL TABLE IF NOT EXISTS app_mall.app_category_top10(
    first_category_name string,
    order_total bigint
) PARTITIONED BY (dt string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LOCATION 'hdfs://bigdata01:9000/data/app/category_top10/';

-- Category preference Top10: roll the per-goods totals up to first-level
-- category and keep the ten with the most orders.
INSERT OVERWRITE TABLE app_mall.app_category_top10 PARTITION(dt='20260201')
SELECT
    first_category_name,
    SUM(order_total) AS order_total
FROM app_mall.app_goods_sales_item
WHERE dt = '20260201'
GROUP BY first_category_name
ORDER BY order_total DESC
LIMIT 10;
开发脚本
1
2
3
4
5
针对 app 层抽取脚本
1:表初始化脚本(初始化执行一次)
app_mall_init_table_3.sh
2:添加分区数据脚本(每天执行一次)
app_mall_add_partition_3.sh
app_mall_init_table_3.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#!/bin/bash
# Requirement 3: goods-related metrics.
# APP-layer database/table initialization script — run ONCE.
# app_goods_sales_item: per-goods daily sales (order count, revenue);
# app_category_top10: daily Top10 first-level categories by order count.
# Both are external, dt-partitioned, tab-delimited tables.

hive -e "
create database if not exists app_mall;

create external table if not exists app_mall.app_goods_sales_item(
goods_name string,
first_category_name string,
order_total bigint,
price_total decimal(10,2)
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/app/goods_sales_item/';


create external table if not exists app_mall.app_category_top10(
first_category_name string,
order_total bigint
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/app/category_top10/';

"
app_mall_add_partition_3.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/bin/bash
# Requirement 3: goods-related metrics — daily APP-layer load.
# Scheduled once per day in the early morning; computes per-goods sales and
# the Top10 categories for the target partition.
#
# Usage: app_mall_add_partition_3.sh [yyyyMMdd]
#   Without an argument the target date defaults to yesterday.

# Fall back to yesterday's date when no argument is supplied.
dt=${1:-$(date +%Y%m%d --date="1 days ago")}

hive -e "
insert overwrite table app_mall.app_goods_sales_item partition(dt='${dt}') select
goods_name,
first_category_name,
count(order_id) as order_total,
sum(goods_amount * order_curr_price) as price_total
from dws_mall.dws_order_goods_all_info
where dt = '${dt}'
group by goods_name,first_category_name;


insert overwrite table app_mall.app_category_top10 partition(dt='${dt}') select
first_category_name,
sum(order_total) as order_total
from app_mall.app_goods_sales_item
where dt ='${dt}'
group by first_category_name
order by order_total desc
limit 10;

"

image-20230406010738922

image-20230406010754961

image-20230406010814759

image-20230406010840827

image-20230406010927249

image-20230406011604503

1
2
3
问题:如果想要统计商品品类内最受用户喜欢的Top10商品,如何统计?
基于dws_order_goods_all_info表中的数据,根据商品品类进行分区,根据商品的订单总量
进行排序,获取每个品类中用户喜欢的Top10商品

需求四:漏斗分析

image-20230406011942433

image-20230330235704680

1
2
3
4
5
6
7
8
9
10
11
12
13
14
其实就是根据用户的行为一层一层分析用户的转化率。
活跃-->商品详情页-->下单-->支付
每一个指标对应的表:
活跃:dws_user_active_history(用户行为相关表)
商品详情页:dwd_good_item(用户行为相关表)
下单:dwd_user_order(商品订单相关表)
支付:dwd_user_order(商品订单相关表)
实现思路:
首先统计当天活跃用户数量
接着统计当天进入了多少个商品详情页
接着统计当天下单的数量
最后统计当天支付的数量
并且计算每一层的转化率。
最终把结果数据保存到表 app_user_conver_funnel中

app层

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
-- app layer: user conversion funnel (requirement 4):
-- active -> goods detail page -> order -> payment, with stage-to-stage ratios.
-- One-time DDL plus a sample load for partition 20260201.
CREATE EXTERNAL TABLE IF NOT EXISTS app_mall.app_user_conver_funnel(
    active_num int,
    item_num int,
    order_num int,
    pay_num int,
    active_to_item_ratio decimal(10,2),
    item_to_order_ratio decimal(10,2),
    order_to_pay_ratio decimal(10,2)
) PARTITIONED BY (dt string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LOCATION 'hdfs://bigdata01:9000/data/app/user_conver_funnel/';

-- Each subquery yields exactly one row, so the CROSS JOINs simply place the
-- three aggregates side by side before the ratios are derived.
INSERT OVERWRITE TABLE app_mall.app_user_conver_funnel PARTITION(dt='20260201')
SELECT
    active_stats.active_num,
    item_stats.item_num,
    order_stats.order_num,
    order_stats.pay_num,
    item_stats.item_num / active_stats.active_num AS active_to_item_ratio,
    order_stats.order_num / item_stats.item_num AS item_to_order_ratio,
    order_stats.pay_num / order_stats.order_num AS order_to_pay_ratio
FROM
(
    SELECT COUNT(*) AS active_num
    FROM dws_mall.dws_user_active_history
    WHERE dt = '20260201'
) AS active_stats
CROSS JOIN
(
    SELECT COUNT(DISTINCT goods_id) AS item_num
    FROM dwd_mall.dwd_good_item
    WHERE dt = '20260201'
) AS item_stats
CROSS JOIN
(
    SELECT
        COUNT(*) AS order_num,
        SUM(CASE WHEN order_status != 0 THEN 1 ELSE 0 END) AS pay_num
    FROM dwd_mall.dwd_user_order
    WHERE dt = '20260201'
) AS order_stats;
开发脚本
1
2
3
4
5
针对 app 层抽取脚本
1:表初始化脚本(初始化执行一次)
app_mall_init_table_4.sh
2:添加分区数据脚本(每天执行一次)
app_mall_add_partition_4.sh
app_mall_init_table_4.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
#!/bin/bash
# Requirement 4: funnel analysis.
# APP-layer database/table initialization script — run ONCE.
# app_user_conver_funnel holds one row per dt partition: stage counts
# (active, item page, order, payment) and the three stage-to-stage ratios.

hive -e "
create database if not exists app_mall;

create external table if not exists app_mall.app_user_conver_funnel(
active_num int,
item_num int,
order_num int,
pay_num int,
active_to_item_ratio decimal(10,2),
item_to_order_ratio decimal(10,2),
order_to_pay_ratio decimal(10,2)
)partitioned by(dt string)
row format delimited
fields terminated by '\t'
location 'hdfs://bigdata01:9000/data/app/user_conver_funnel/';
"
app_mall_add_partition_4.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/bin/bash
# Requirement 4: funnel analysis — daily APP-layer load.
# Scheduled once per day in the early morning; each single-row subquery below
# is one funnel stage, glued together with join-on-1=1 before computing the
# stage-to-stage conversion ratios.
#
# Usage: app_mall_add_partition_4.sh [yyyyMMdd]
#   Without an argument the target date defaults to yesterday.

# Fall back to yesterday's date when no argument is supplied.
dt=${1:-$(date +%Y%m%d --date="1 days ago")}

hive -e "
insert overwrite table app_mall.app_user_conver_funnel partition(dt='${dt}') select
duah.active_num,
dgi.item_num,
duo.order_num,
duo.pay_num,
dgi.item_num/duah.active_num as active_to_item_ratio,
duo.order_num/dgi.item_num as item_to_order_ratio,
duo.pay_num/duo.order_num as order_to_pay_ratio
from
(
select
count(*) as active_num
from dws_mall.dws_user_active_history
where dt = '${dt}'
) as duah
join
(
select
count(distinct goods_id) as item_num
from dwd_mall.dwd_good_item
where dt = '${dt}'
)as dgi
on 1=1
join
(
select
count(*) as order_num,
sum(case when order_status != 0 then 1 else 0 end) as pay_num
from dwd_mall.dwd_user_order
where dt = '${dt}'
) as duo
on 1=1;
"

image-20230406013300841

image-20230406013314309

image-20230406013432629


本文标题:大数据开发工程师-第十三周 综合项目:电商数据仓库之商品订单数仓

文章作者:TTYONG

发布时间:2023年04月05日 - 22:04

最后更新:2023年06月18日 - 16:06

原始链接:http://tianyong.fun/%E5%A4%A7%E6%95%B0%E6%8D%AE%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88-%E7%AC%AC%E5%8D%81%E4%B8%89%E5%91%A8-%E7%BB%BC%E5%90%88%E9%A1%B9%E7%9B%AE-%E7%94%B5%E5%95%86%E6%95%B0%E6%8D%AE%E4%BB%93%E5%BA%93%E4%B9%8B%E5%95%86%E5%93%81%E8%AE%A2%E5%8D%95%E6%95%B0%E4%BB%93.html

许可协议: 转载请保留原文链接及作者。

多少都是爱
0%