首发于数据分析
python数据分析实例(五) 淘宝双11化妆品数据

python数据分析实例(五) 淘宝双11化妆品数据

bokeh柱状图 、折线图、浮动散点图、散点图、堆叠图,matplotlib饼图,常用函数:

data_S = df_S[['id','title','店名','date','price']]
print('数据时间周期为:\n', data_S['date'].unique())
data_S['period'] = pd.cut(data_S['date'],[4,10,11,14],labels = ['双十一前','双十一当天','双十一后'],right=True)
price_SS = data_S[['id','price','period','店名']].groupby(['id','period']).min()
price_SS.reset_index(inplace = True)
price_SS_11top = price_SS[price_SS['period']=='双十一前']
price_SS_11middle = price_SS[price_SS['period']=='双十一当天']
price_SS_11middle_top = pd.merge(price_SS_11top,price_SS_11middle,left_on='id',right_on = 'id')
price_SS_11middle_top['zkl'] = price_SS_11middle_top['price_y']/price_SS_11middle_top['price_x']
price11middle_SS_top_100 = price_SS_11middle_top[price_SS_11middle_top['zkl']<1]
price11middle_SS_top_100.sort_values(by = 'zkl',inplace = True,ascending = False)
n2 = len(price11middle_SS_top_100['id'].unique())
brands = price_SS_95['店名_y'].dropna().unique().tolist()
price11middle_SSS_top_100['zkl_range'] = pd.cut(price11middle_SSS_top_100['zkl'],bins = np.linspace(0,1,21))
price11middle_SSS_top_95 = price11middle_SSS_top_100.groupby('zkl_range').count().iloc[:-1] # 这里去掉折扣率在0.95-1之间的数据,该区间内数据zkl大部分为1,不打折
price11middle_SSS_top_95['zkl_pre'] = price11middle_SSS_top_95['zkl']/price11middle_SSS_top_95['zkl'].sum()
price11middle_SSS_top_95 = price11middle_SSS_top_95.reset_index()
price11middle_SSS_top_95['zkl_range'] = price11middle_SSS_top_95['zkl_range'].astype(str)
lst_brand = price11middle_SSS_top_95['zkl_range'].tolist()
一, 双11降价商品占比
二, x=各店铺 , y=打折商品数 bokeh柱状图
三, x=打折率period , y=商品总数 bokeh折线图
四, x=打折率period, y=各商品(各店铺) bokeh浮动散点图
五, x=参与打折商品比例, y=商品平均折扣率 z= 店铺商品总数 bokeh散点图
六, 7分类 matplotlib饼图
七, 未参与双十一当天活动的商品分类数量
八, x=各店铺 , y=参与活动商品数 bokeh堆叠图


一, 双11降价商品占比

①加载模块,导入数据,数据去空值,提取销售日期

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore') 
# 不发出警告

import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['KaiTi']
mpl.rcParams['font.serif'] = ['KaiTi']
mpl.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题,或者转换负号为字符串,中文不显

import os
os.chdir('C:\\Users\\Administrator\\Desktop\\')
##创建工作路径
df = pd.read_excel('双十一淘宝美妆数据.xlsx',sheetname=0,header=0,index_col=0)

df_S = df.copy()
def data_cleaning(df):
    cols = df.columns
    for col in cols:
        if df[col].dtype ==  'object':
            df[col].fillna('缺失数据', inplace = True)
        else:
            df[col].fillna(0, inplace = True)
    return(df)

##df.fillna(0,inplace = True)   # 填充缺失值
df_S = data_cleaning(df_S)

df_S['date'] = df_S.index.day     # 提取销售日期

②以'id','period' 分组聚合,一维转二维

data_S = df_S[['id','title','店名','date','price']]
print('数据时间周期为:\n', data_S['date'].unique())
data_S['period'] = pd.cut(data_S['date'],[4,10,11,14],labels = ['双十一前','双十一当天','双十一后'],right=True)

price_SS = data_S[['id','price','period','店名']].groupby(['id','period']).min()
price_SS.reset_index(inplace = True)

price_SS_11top = price_SS[price_SS['period']=='双十一前']
price_SS_11middle = price_SS[price_SS['period']=='双十一当天']
price_SS_11middle_top = pd.merge(price_SS_11top,price_SS_11middle,left_on='id',right_on = 'id')

price_SS_11middle_top['zkl'] = price_SS_11middle_top['price_y']/price_SS_11middle_top['price_x']

price11middle_SS_top_100 = price_SS_11middle_top[price_SS_11middle_top['zkl']<1]

n2 = len(price11middle_SS_top_100['id'].unique())

print('真打折的商品数量约占比%.2f%%' % (n2/(len(df_S['id'].unique()))*100))

二, x=各店铺 ,y=打折商品数 bokeh柱状图

#result_SS_dpdzs = price_SS_11middle.dropna()
#result_SS_dpdzs =  result_SS_dpdzs[['店名','id']].groupby('店名').count()
#result_SS_dpdzs.sort_values(by = 'id',inplace = True,ascending = False)


result_SS_dpdzs =  price11middle_SS_top_100[['店名_y','id']].groupby('店名_y').count()
result_SS_dpdzs.sort_values(by = 'id',inplace = True,ascending = False)



from bokeh.plotting import figure,show,output_file
from bokeh.models import ColumnDataSource

from bokeh.models import HoverTool
from bokeh.core.properties import value

# 设置好参数
lst_brand = result_SS_dpdzs.index.tolist()
colors = ["#718dbf" ,"#e84d60"]

 
 
# 修改数据index和columns名字为英文
result_SS_dpdzs.index.name = 'brand'
result_SS_dpdzs.columns = ['sale_on_11_num']
 

source = ColumnDataSource(data=result_SS_dpdzs)

# 设置标签显示内容
hover = HoverTool(tooltips=[("品牌", "@brand"),
                            ("双十一当天参与活动的商品数量", "@sale_on_11_num")
                           ])  

# 构建绘图空间
p = figure(x_range=lst_brand, plot_width=900, plot_height=350, title="各个品牌参与双十一活动的商品数量分布",
          tools=[hover,'reset,xwheel_zoom,pan,crosshair'])

p.vbar(top = 'sale_on_11_num',          # 设置堆叠值,这里source中包含了不同年份的值,years变量用于识别不同堆叠层
             x='brand',     # 设置x坐标
             source=source,
             width=0.9, 
#             color=colors, 
             alpha = 0.8,
#             legend=[value(x) for x in lst_type],
             muted_color='black', muted_alpha=0.2
             )
show(p)

三, x=打折率period , y=商品总数 bokeh折线图

以'zkl'分组聚合

price11middle_SS_top_100['zkl_range'] = pd.cut(price11middle_SS_top_100['zkl'],bins = np.linspace(0,1,21))
price11middle_SSS_top_95 = price11middle_SS_top_100.groupby('zkl_range').count().iloc[:-1] # 这里去掉折扣率在0.95-1之间的数据,该区间内数据zkl大部分为1,不打折
price11middle_SSS_top_95['zkl_pre'] = price11middle_SSS_top_95['zkl']/price11middle_SSS_top_95['zkl'].sum()
price11middle_SSS_top_95 = price11middle_SSS_top_95.reset_index()
price11middle_SSS_top_95['zkl_range'] = price11middle_SSS_top_95['zkl_range'].astype(str)
lst_brand = price11middle_SSS_top_95['zkl_range'].tolist()


from bokeh.plotting import figure,show,output_file
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.core.properties import value
output_file("line33.html")

source = ColumnDataSource(data=price11middle_SSS_top_95)
### 创建数据

hover = HoverTool(tooltips=[("折扣率", "@zkl")])  # 设置标签显示内容
p = figure(
          x_range=lst_brand, 
           plot_width=1200, plot_height=350, title="商品折扣率统计",
          tools=[hover,'reset,xwheel_zoom,pan,crosshair']
          )
## 构建绘图空间

p.line(x='zkl_range',y='zkl_pre',source = source,     # 设置x,y值, source → 数据源
       line_width=2, line_alpha = 0.8, line_color = 'black',line_dash = [10,4])   # 线型基本设置
# 绘制折线图
p.circle(x='zkl_range',y='zkl_pre',source = source, size = 8,color = 'red',alpha = 0.8)

p.xgrid.grid_line_color = None
p.axis.minor_tick_line_color = None
p.outline_line_color = None
# 设置其他参数

show(p)

四, x=打折率period,y=各商品(各店铺) bokeh浮动散点图

from bokeh.transform import jitter
price_SS_95 = price_SS_11middle_top[price_SS_11middle_top['zkl']<0.95]
brands = price_SS_95['店名_y'].dropna().unique().tolist()
# 得到y坐标

source = ColumnDataSource(data = price_SS_95)
# 创建数据
output_file("line44.html")
hover = HoverTool(tooltips=[("折扣率", "@zkl")])  # 设置标签显示内容

p = figure(plot_width=800, plot_height=600,y_range=brands,title="不同品牌折扣率情况",
          tools=[hover,'reset,ywheel_zoom,pan,crosshair'])

p.circle(x='zkl', 
         y=jitter('店名_y', width=0.6, range=p.y_range),
         source=source, alpha=0.3)
# jitter参数 → 'day':第一参数,这里指y的值;  width:间隔宽度比例;  range:分类范围对象,这里和y轴的分类一致

p.ygrid.grid_line_color = None
# 设置其他参数

show(p)

五, x=参与打折商品比例, y=商品平均折扣率 z= 店铺商品总数 bokeh散点图

data_SS_95dzl = price_SS_11middle_top[price_SS_11middle_top['zkl']<0.95]  # 删除未打折数据
result_SS_d95zkl = data_SS_95dzl.groupby('店名_y')['zkl'].mean()




result_SS_95num = data_SS_95dzl['店名_y'].value_counts()
result_SS_1000num = price_SS_11middle_top['店名_y'].value_counts()


result_SS_dzbdzl = pd.DataFrame({'打折商品数':result_SS_95num,'商品总数':result_SS_1000num,'商品平均折扣率':result_SS_d95zkl})
result_SS_dzbdzl['参与打折商品比例'] = result_SS_dzbdzl['打折商品数'] / result_SS_dzbdzl['商品总数']
result_SS_dzbdzl.dropna(inplace = True)


# 用bokeh绘制散点图,x轴为参与打折商品比例,y轴为折扣力度,点的大小代表该品牌参加双11活动的商品总数
from bokeh.plotting import figure,show,output_file
from bokeh.models import ColumnDataSource
from bokeh.models.annotations import Span            # 导入Span模块
from bokeh.models.annotations import Label           # 导入Label模块
from bokeh.models.annotations import BoxAnnotation   # 导入BoxAnnotation模块
from bokeh.models import HoverTool
#
#
#
#
bokeh_data = result_SS_dzbdzl[['商品平均折扣率','商品总数','参与打折商品比例']]
bokeh_data.index.name = 'index'
bokeh_data.columns = ['zkl','amount','pre']
bokeh_data['size'] = bokeh_data['amount'] * 0.3
source = ColumnDataSource(bokeh_data)
## 创建ColumnDataSource数据
#


hover = HoverTool(tooltips=[("品牌", "@index"),
                            ("折扣率", "@zkl"),
                            ("商品总数", "@amount"),
                            ("参与打折商品比例", "@pre"),
                           ])  # 设置标签显示内容
p = figure(plot_width=600, plot_height=600,
                title="各个品牌打折套路解析" , 
                tools=[hover,'box_select,reset,wheel_zoom,pan,crosshair']) 
# 构建绘图空间

p.circle_x(x = 'pre',y = 'zkl',source = source,size = 'size',
           fill_color = 'red',line_color = 'black',fill_alpha = 0.6,line_dash = [8,3])
p.ygrid.grid_line_dash = [6, 4]
p.xgrid.grid_line_dash = [6, 4]
# 散点图


x_mean = bokeh_data['pre'].mean()
y_mean = bokeh_data['zkl'].mean()
x = Span(location=x_mean, dimension='height', line_color='green',line_alpha = 0.7, line_width=1.5, line_dash = [6,4])
y = Span(location=y_mean, dimension='width', line_color='green',line_alpha = 0.7, line_width=1.5, line_dash = [6,4])
p.add_layout(x)
p.add_layout(y)
### 绘制辅助线
##
bg1 = BoxAnnotation(bottom=y_mean, right=x_mean,fill_alpha=0.1, fill_color='olive')
label1 = Label(x=0.1, y=0.55,text="少量大打折",text_font_size="10pt" )
p.add_layout(bg1)
p.add_layout(label1)
# 绘制第一象限

bg2 = BoxAnnotation(bottom=y_mean, left=x_mean,fill_alpha=0.1, fill_color='firebrick')
label2 = Label(x=0.7, y=0.55,text="大量大打折",text_font_size="10pt" )
p.add_layout(bg2)
p.add_layout(label2)
# 绘制第二象限

bg3 = BoxAnnotation(top=y_mean, right=x_mean,fill_alpha=0.1, fill_color='firebrick')
label3 = Label(x=0.1, y=0.80,text="少量少打折",text_font_size="10pt" )
p.add_layout(bg3)
p.add_layout(label3)
# 绘制第三象限

bg4 = BoxAnnotation(top=y_mean, left=x_mean,fill_alpha=0.1, fill_color='olive')
label4 = Label(x=0.7, y=0.80,text="少量少打折",text_font_size="10pt" )
p.add_layout(bg4)
p.add_layout(label4)
# 绘制第四象限
#
show(p)

六, 7分类matplotlib饼图

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore') 
# 不发出警告


from bokeh.plotting import figure,show,output_file
from bokeh.models import ColumnDataSource
# 导入图表绘制、图标展示模块
# 导入ColumnDataSource模块

import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['KaiTi']
mpl.rcParams['font.serif'] = ['KaiTi']
mpl.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题,或者转换负号为字符串,中文不显






import os
os.chdir('C:\\Users\\Administrator\\Desktop\\')
##创建工作路径
df = pd.read_excel('双十一淘宝美妆数据.xlsx',sheetname=0,header=0,index_col=0)






def data_cleaning(df):
    cols = df.columns
    for col in cols:
        if df[col].dtype ==  'object':
            df[col].fillna('缺失数据', inplace = True)
        else:
            df[col].fillna(0, inplace = True)
    return(df)
# 该函数可以将任意数据内空值替换


df = data_cleaning(df)
df['date'] = df.index.day     # 提取销售日期
print('数据时间周期为:\n', df.index.unique())


# 双十一当天在售的商品占比情况
# 按照商品销售节奏分类,我们可以将商品分为7类
#   A. 11.11前后及当天都在售 → 一直在售
#   B. 11.11之后停止销售 → 双十一后停止销售
#   C. 11.11开始销售并当天不停止 → 双十一当天上架并持续在售
#   D. 11.11开始销售且当天停止 → 仅双十一当天有售
#   E. 11.5 - 11.10 → 双十一前停止销售
#   F. 仅11.11当天停止销售 → 仅双十一当天停止销售
#   G. 11.12开始销售 → 双十一后上架


## 筛选数据
data1 = df[['id','title','店名','date']]
##
##
##### 统计不同商品的销售开始日期、截止日期
d1 = data1[['id','date']].groupby(by = 'id').agg(['min','max'])['date']  
###
##### 筛选双十一当天售卖的商品id
id_11 = data1[data1['date']==11]['id'].unique()
d2 = pd.DataFrame({'id':id_11,'双十一当天是否售卖':True})
###
### 合并数据
id_date = pd.merge(d1,d2,left_index=True,right_on='id',how = 'left')
id_date['双十一当天是否售卖'][id_date['双十一当天是否售卖']!=True] = False
##
##
# 统计
m = len(data1['id'].unique())
m_11 = len(id_11)
m_11_pre = m_11/m
print('商品总数为%i个\n-------' % m)
print('双十一当天参与活动的商品总数为%i个,占比为%.2f%%\n-------' % (m_11,m_11_pre*100))
print('品牌总数为%i个\n' % len(data1['店名'].unique()),data1['店名'].unique())
#
## 商品销售节奏分类
id_date['type'] = '待分类'
id_date['type'][(id_date['双十一当天是否售卖']== True)&(id_date['min'] <11)&(id_date['max']>11)] = 'A'      #  A类:11.11前后及当天都在售 → 一直在售
id_date['type'][(id_date['双十一当天是否售卖']== True)&(id_date['min'] <11)&(id_date['max']==11)] = 'B'     #  B类:11.11之后停止销售 → 双十一后停止销售
id_date['type'][(id_date['双十一当天是否售卖']== True)&(id_date['min'] ==11)&(id_date['max']>11)] = 'C'     #  C类:11.11开始销售并当天不停止 → 双十一当天上架并持续在售
id_date['type'][(id_date['双十一当天是否售卖']== True)&(id_date['min'] ==11)&(id_date['max']==11)] = 'D'    #  D类:11.11开始销售且当天停止 → 仅双十一当天有售
id_date['type'][id_date['双十一当天是否售卖']== False] = 'F'         #  F类:仅11.11当天停止销售 → 仅双十一当天停止销售
id_date['type'][(id_date['双十一当天是否售卖']== False)&(id_date['max']<11)] = 'E'                             #  E类:11.5 - 11.10 → 双十一前停止销售
id_date['type'][(id_date['双十一当天是否售卖']== False)&(id_date['min'] >11)] = 'G'                            #  G类:11.11之后开始销售 → 双十一后上架

## 计算不同类别的商品数量
result1 = id_date['type'].value_counts()
result1 = result1.loc[['A','C','B','D','E','F','G']]  # 调整顺序
#
## 设置调色盘
from bokeh.palettes import brewer
colori = brewer['YlGn'][7]
#
## 绘制饼图
plt.axis('equal')  # 保证长宽相等
plt.pie(result1,labels = result1.index, autopct='%.2f%%',pctdistance=0.8,labeldistance =1.1,
        startangle=90, radius=1.5,counterclock=False, colors = colori)

七,未参与双十一当天活动的商品分类数量

# 未参与双十一当天活动的商品,在双十一之后的去向如何?
#   con1 → 暂时下架(F)
#   con2 → 重新上架(E中部分数据,数据中同一个id可能有不同title,“换个马甲重新上架”)
#   con3 → 预售(E中部分数据,预售商品的title中包含“预售”二字),字符串查找特定字符 dataframe.str.contains('预售')
#   con4 → 彻底下架(E中部分数据),可忽略

# 找到未参与双十一当天活动的商品id
id_not11 = id_date[id_date['双十一当天是否售卖']==False]  # 筛选出双十一当天没参加活动的产品id
print('双十一当天没参加活动的商品总数为%i个,占比为%.2f%%\n-------' % (len(id_not11),len(id_not11)/m*100))
print('双十一当天没参加活动的商品销售节奏类别为:\n',id_not11['type'].value_counts().index.tolist())
print('------')

# 筛选出未参与双十一当天活动商品id对应的原始数据
df_not11 = id_not11[['id','type']]
data_not11 = pd.merge(df_not11,df,on = 'id', how = 'left')
#
#
# 筛选出con1的商品id
# con1 → 暂时下架(F)
id_con1 = id_date['id'][id_date['type'] == 'F'].values

# 筛选出con2的商品id
# con2 → 重新上架(E中部分数据,数据中同一个id可能有不同title,“换个马甲重新上架”)
data_con2 = data_not11[['id','title','date']].groupby(by = ['id','title']).count()   # 按照id和title分组(找到id和title一对多的情况)
title_count = data_con2.reset_index()['id'].value_counts()   # 计算id出现的次数,如果出现次数大于1,则说明该商品是更改了title的
id_con2 = title_count[title_count>1].index

# 筛选出con3的商品id
# con3 → 预售(E中部分数据,预售商品的title中包含“预售”二字)
data_con3 = data_not11[data_not11['title'].str.contains('预售')]   # 筛选出title中含有“预售”二字的数据
id_con3 = data_con3['id'].value_counts().index     

##
print("未参与双十一当天活动的商品中:\n暂时下架商品的数量为%i个,重新上架商品的数据量为%i个,预售商品的数据量为%i个" 
      % (len(id_con1), len(id_con2), len(id_con3)))

z = data_con3['id'].unique()

八, x=各店铺 , y=参与活动商品数 bokeh堆叠图

from bokeh.models import HoverTool
from bokeh.plotting import figure,show,output_file
from bokeh.models import ColumnDataSource
from bokeh.core.properties import value
from bokeh.palettes import Spectral

lst_brand = result2_data.index.tolist()
lst_type = result2_data.columns.tolist()[:2]
#colors = ["#718dbf" ,"#e84d60"]
# 设置好参数

result2_data.index.name = 'brand'
result2_data.columns = ['sale_on_11','presell','sum']
# 修改数据index和columns名字为英文

source = ColumnDataSource(data=result2_data)
# 创建数据

hover = HoverTool(tooltips=[("品牌", "@brand"),
                            ("双十一当天参与活动的商品数量", "@sale_on_11"),
                            ("预售商品数量", "@presell"),
                            ("参与双十一活动商品总数", "@sum")
                           ])  # 设置标签显示内容

p = figure(x_range=lst_brand, plot_width=900, plot_height=350, title="各个品牌参与双十一活动的商品数量分布",
          tools=[hover,'reset,xwheel_zoom,pan,crosshair'])
# 构建绘图空间

p.vbar_stack(lst_type,          # 设置堆叠值,这里source中包含了不同年份的值,years变量用于识别不同堆叠层
             x='brand',     # 设置x坐标
             source=source,
             width=0.9, 
#             color=colors,
             color = Spectral[11][:2],
             alpha = 0.8,legend=[value(x) for x in lst_type],
             muted_color='black', muted_alpha=0.2
             )
# 绘制堆叠图

p.xgrid.grid_line_color = None
p.axis.minor_tick_line_color = None
p.outline_line_color = None
p.legend.location = "top_right"
p.legend.orientation = "horizontal"
p.legend.click_policy="mute"
# 设置其他参数

show(p)

编辑于 2019-08-09

文章被以下专栏收录