Bootstrap

Python数据可视化实战案例:星巴克以及自行车租赁相关的真实数据,数据已经上传绑定,需要的可自行下载。

# 导入必要的库
import matplotlib.pyplot as plt
import numpy as np

# Use the SimHei font so Chinese text in the figure can be rendered
plt.rcParams['font.sans-serif'] = ['SimHei']
# Render minus signs with the ASCII hyphen so they display with this font
plt.rcParams['axes.unicode_minus'] = False

# Create the figure that holds both subplots
fig = plt.figure()

# Left subplot: 121 means 1 row, 2 columns, position 1
ax1 = fig.add_subplot(121)

# Time samples and the corresponding sine wave
t = np.arange(0.0, 5, 0.01)
s = np.sin(2*np.pi * t)
ax1.plot(t, s, lw=2)  # lw=2 sets the line width

# Box style for the formula label, plus arrow/box styles for the annotation
bbox = dict(boxstyle='round', fc='white')
arrowprops = dict(facecolor='black', edgecolor='red', headwidth=7, width=2)
bbox_prop = dict(fc='white')
# Annotate a local maximum on ax1 explicitly (not via the implicit current
# axes) and pass the arrow/box styles so the arrow is actually drawn.
# sin(2*pi*t) reaches 1 at t = 2.25, so the arrow points at the peak itself.
ax1.annotate('local max', xy=(2.25, 1), xytext=(3, 1.5),
             arrowprops=arrowprops, bbox=bbox_prop)

# Axis labels and font sizes
ax1.set_ylabel('y', fontsize=12)
ax1.set_xlabel('x', fontsize=12)
# Leave head-room above and below the wave for the text labels
ax1.set_ylim(-2, 2)
# Plain text label
ax1.text(1, 1.2, 'max', fontsize=18)
# Formula label with a rounded box, rotated 10 degrees and slightly transparent
ax1.text(1.2, -1.8, '$y=sin(2*np.pi*t)$', bbox=bbox, rotation=10, alpha=0.8)

# Right subplot: 122 means 1 row, 2 columns, position 2
ax2 = fig.add_subplot(122)
# x samples and the corresponding sine wave
x = np.linspace(0, 10, 200)
y = np.sin(x)
# Dash-dot purple line
ax2.plot(x, y, linestyle='-.', color='purple')
# Annotation with bold text, a red '-|>' arrow and a yellow rounded box
ax2.annotate(text='Here I am', xy=(4.8, np.sin(4.8)), xytext=(3.7, -0.2),
             weight='bold', color='k',
             arrowprops=dict(arrowstyle='-|>', connectionstyle='arc3', color='red'),
             bbox=dict(boxstyle='round, pad=0.5', fc='yellow', ec='k', lw=1, alpha=0.8))
# Axis ranges
ax2.set_ylim(-1.5, 1.5)
ax2.set_xlim(0, 10)
# Box style kept from the original script; none of the calls below use it
bbox = dict(boxstyle='round', ec='red', fc='white')
# Formula label in a square box
# NOTE(review): y=-1.9 lies below the y-limits set above, so this label is
# presumably drawn outside the plotting area — confirm that is intended
ax2.text(6, -1.9, '$y=sin(x)$', bbox=dict(boxstyle='square', facecolor='white', ec='black'))
# Dotted, semi-transparent gray grid
ax2.grid(ls=':', color='gray', alpha=0.5)
# Faint watermark-style text in a rounded gray box
ax2.text(4.5, 1, '老李制作', fontsize=15, alpha=0.3, color='gray',
         bbox=dict(fc='white', boxstyle='round', edgecolor='gray', alpha=0.3))

# Display the figure
plt.show()

在这里插入图片描述

实战案例开始
# 1 导入模块
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False


# 2 获取数据
starbucks = pd.read_csv('directory.csv')
starbucks.head()
BrandStore NumberStore NameOwnership TypeStreet AddressCityState/ProvinceCountryPostcodePhone NumberTimezoneLongitudeLatitude
0Starbucks47370-257954Meritxell, 96LicensedAv. Meritxell, 96Andorra la Vella7ADAD500376818720GMT+1:00 Europe/Andorra1.5342.51
1Starbucks22331-212325Ajman Drive ThruLicensed1 Street 69, Al JarfAjmanAJAENaNNaNGMT+04:00 Asia/Dubai55.4725.42
2Starbucks47089-256771Dana MallLicensedSheikh Khalifa Bin Zayed St.AjmanAJAENaNNaNGMT+04:00 Asia/Dubai55.4725.39
3Starbucks22126-218024Twofour 54LicensedAl Salam StreetAbu DhabiAZAENaNNaNGMT+04:00 Asia/Dubai54.3824.48
4Starbucks17127-178586Al Ain TowerLicensedKhaldiya Area, Abu Dhabi IslandAbu DhabiAZAENaNNaNGMT+04:00 Asia/Dubai54.5424.51
# 3 数据分析及可视化
# 查看星巴克旗下品牌有哪些。
print('星巴克旗下品牌有:\n', starbucks.Brand.value_counts())
coffee = starbucks[starbucks.Brand == 'Starbucks']
print("\n", coffee.shape)
星巴克旗下品牌有:
 Brand
Starbucks                25249
Teavana                    348
Evolution Fresh              2
Coffee House Holdings        1
Name: count, dtype: int64

 (25249, 13)
# 查看全世界一共有多少个国家和地区开设了星巴克
df = starbucks.groupby(["Country"]).size()
print('全世界一共有多少个国家和地区开设了星巴克门店:', df.size)
df1 = df.sort_values(ascending= False)
print('排名前10的国家和地区:\n', df1.head(10))
全世界一共有多少个国家和地区开设了星巴克门店: 73
排名前10的国家和地区:
 Country
US    13608
CN     2734
CA     1468
JP     1237
KR      993
GB      901
MX      579
TW      394
TR      326
PH      298
dtype: int64
# 星巴克门店数排名后10的国家
print('排名后10的国家:\n', df1.tail(10))
排名后10的国家:
 Country
AZ    4
KH    4
TT    3
AW    3
CW    3
SK    3
ZA    3
LU    2
MC    2
AD    1
dtype: int64
# 柱状图绘制排名前10的分布情况
plt.rcParams['font.size'] = 15
plt.rcParams['font.family'] = 'SimHei'
df1.head(10).plot(kind = 'bar',rot = 0)
plt.title('星巴克门店数排名前10的国家和地区')
plt.ylabel('Store Counts')
plt.xlabel('Countries and Regions')
Text(0.5, 0, 'Countries and Regions')

在这里插入图片描述

# Top 10 cities by store count for the Starbucks brand only
count_starbucks_city = coffee.City.value_counts()
print('星巴克门店数量排名前10的城市:\n', count_starbucks_city.head(10))
# Repeat across all brands, first dropping rows that have no City value
star = starbucks.dropna(how='any', subset=['City'])
count_starbucks_city = star.City.value_counts()
# A Series is not callable; take the first 10 rows with .head(10)
print('全世界星巴克门店数量排名前10的城市:\n', count_starbucks_city.head(10))
星巴克门店数量排名前10的城市:
 City
上海市            542
Seoul          243
北京市            234
New York       230
London         215
Toronto        186
Mexico City    180
Chicago        179
Las Vegas      153
Seattle        151
Name: count, dtype: int64



---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

Cell In[24], line 6
      4 star = starbucks.dropna(how = 'any', subset= ['City'])
      5 count_starbucks_city = star.City.value_counts()
----> 6 print('全世界星巴克门店数量排名前10的城市:\n', count_starbucks_city(10))


TypeError: 'Series' object is not callable
# Bar chart of the 10 cities with the most stores
plt.figure(1, figsize=(8, 6))
count_starbucks_city = star.City.value_counts()
# head() defaults to 5 rows; ask for 10 so the data matches the chart title
city_top10 = count_starbucks_city.head(10)
city_top10.plot(kind='bar', rot=30)
plt.title('拥有星巴克门店最多的10个城市')
plt.ylabel('Store Counts')
plt.xlabel('City')
Text(0.5, 0, 'City')

在这里插入图片描述

import pinyin

# Count stores per city for the rows whose Country is 'CN'
df = star[star['Country'] == 'CN']
df1 = df.copy()


def _city_to_pinyin(name):
    """Return *name* lower-cased, without a trailing '市', as toneless pinyin."""
    name = name.lower()
    # Remove the suffix only when it is present. Slicing the last three
    # pinyin letters off every name truncated cities that have no '市'
    # suffix (e.g. 'hong kong' became 'hong k').
    if name.endswith('市'):
        name = name[:-1]
    return pinyin.get(name, format='strip', delimiter='')


# Normalise the city names so Chinese and Latin spellings group together
df1['City'] = df1['City'].apply(_city_to_pinyin)
# Number of stores per city, largest first
df1 = df1.groupby(['City']).size().sort_values(ascending=False)
df1.head(10)
City
shanghai     542
beijing      234
hangzhou     117
shenzhen     113
guangzhou    106
hong k       104
chengdu       98
suzhou        90
nanjing       73
wuhan         67
dtype: int64
# 绘制柱状图
df1.head(10).plot(kind = 'bar', rot = 30)
plt.title('中国拥有星巴克门店最多的10个城市')
plt.ylabel('Store Counts')
plt.xlabel('Cities')

Text(0.5, 0, 'Cities')

在这里插入图片描述

# 用饼图显示星巴克门店的经营方式有哪几种
plt.figure(1, figsize=(8, 6))
ownership = star['Ownership Type'].value_counts()
plt.title('星巴克门店所有权类型')
ownership.plot(kind = 'pie')
plt.show()
plt.pie(ownership.values, autopct='%.1f%%', labels=ownership.index)
plt.show()

在这里插入图片描述

# 导入模块
import numpy as np
import pandas as pd
import  datetime
import matplotlib.pyplot as plt
import seaborn as sns

# 获取数据,导入待处理数据bike.csv,并显示前5行
bike = pd.read_csv('bike.csv')
bike.head()
datetimeseasonholidayworkingdayweathertempatemphumiditywindspeedcasualregisteredcount
02011-01-01 00:00:0010019.8414.395810.031316
12011-01-01 01:00:0010019.0213.635800.083240
22011-01-01 02:00:0010019.0213.635800.052732
32011-01-01 03:00:0010019.8414.395750.031013
42011-01-01 04:00:0010019.8414.395750.0011
# 分析数据,查看待处理数据的类型
bike.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB
# 将字段datetime的类型转换为日期时间
bike.datetime = pd.to_datetime(bike.datetime)
bike.dtypes
datetime      datetime64[ns]
season                 int64
holiday                int64
workingday             int64
weather                int64
temp                 float64
atemp                float64
humidity               int64
windspeed            float64
casual                 int64
registered             int64
count                  int64
dtype: object
# 将datetime 设置为索引, 并从租赁数值差异等着手观察它们的密度分布
bike = bike.set_index('datetime')
sns.displot(bike['count'])
<seaborn.axisgrid.FacetGrid at 0x188cdad03d0>

在这里插入图片描述

# 显示count字段的描述信息
bike["count"].describe()
count    10886.000000
mean       191.574132
std        181.144454
min          1.000000
25%         42.000000
50%        145.000000
75%        284.000000
max        977.000000
Name: count, dtype: float64
# 将count列中小于中位数(145)的数据删除,并绘制对应的密度图
def Count(x, threshold=145):
    """Return *x* unchanged unless it is below *threshold*, then return NaN.

    Meant for ``Series.apply``: values mapped to NaN can afterwards be
    removed with ``dropna``.

    Args:
        x: A single numeric value.
        threshold: Values strictly below this are replaced by NaN. The
            default, 145, equals the 50% value (the median) reported by
            ``bike["count"].describe()``.

    Returns:
        ``np.nan`` when ``x < threshold``; otherwise ``x`` itself.
    """
    return np.nan if x < threshold else x
# Work on a copy so the NaN masking below does not overwrite bike['count']
bike1 = bike.copy()
# Replace the counts rejected by Count with NaN
bike1['count'] = bike1['count'].apply(Count)
# Drop every row that contains a NaN, which removes the masked counts
bike1 = bike1.dropna(axis=0, how='any')
# Distribution plot of the remaining counts
sns.displot(bike1['count'])
<seaborn.axisgrid.FacetGrid at 0x188d2f1f490>

在这里插入图片描述

# 按年份统计自行车租赁数的均值
bike = bike1
y_bike = bike.groupby(bike.index.year).mean()['count']
y_bike
datetime
2011    274.526697
2012    366.408629
Name: count, dtype: float64
# 绘制年统计自行车租赁数均值的柱状图
y_bike.plot(kind = 'bar', rot = 0)
<Axes: xlabel='datetime'>

在这里插入图片描述

# Monthly means, indexed by month periods.
# Resample into month-end bins using an offset object, then convert the
# resulting index to periods. This avoids the deprecated 'M' alias and the
# deprecated `kind='period'` keyword, both of which emit FutureWarning.
mm_bike = bike.resample(pd.offsets.MonthEnd()).mean().to_period('M')
mm_bike.head(10)
C:\Users\admin.DESKTOP-G6CFGT8\AppData\Local\Temp\ipykernel_3984\4111023337.py:2: FutureWarning: The 'kind' keyword in DataFrame.resample is deprecated and will be removed in a future version. Explicitly cast the index to the desired type instead
  mm_bike = bike.resample('M', kind = 'period').mean()
C:\Users\admin.DESKTOP-G6CFGT8\AppData\Local\Temp\ipykernel_3984\4111023337.py:2: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
  mm_bike = bike.resample('M', kind = 'period').mean()
seasonholidayworkingdayweathertempatemphumiditywindspeedcasualregisteredcount
datetime
2011-011.00.0000001.0000001.1600008.69200010.90960049.32000011.8804405.280000175.520000180.800000
2011-021.00.0000000.7910451.28358214.29492517.24313444.17910418.17910023.835821168.208955192.044776
2011-031.00.0000000.6666671.29166716.55375019.72802149.45833318.18777848.583333163.781250212.364583
2011-042.00.0780140.6170211.45390119.97078023.63475255.17730516.89374160.624113177.539007238.163121
2011-052.00.0000000.7581971.44672123.06082027.21403764.06967213.94662755.745902224.110656279.856557
2011-062.00.0000000.6925931.32222229.41066733.31963051.77407412.84457064.659259229.333333293.992593
2011-073.00.0526320.6165411.16165431.30488735.26462453.64661713.53780386.962406218.849624305.812030
2011-083.00.0000000.8078431.29019630.97349034.79527555.38823514.29819459.368627222.450980281.819608
2011-093.00.0391300.6130431.50869626.28278330.11263067.60434811.98289269.530435220.782609290.313043
2011-104.00.0625000.6294641.41964322.49142926.52879562.70982111.60766066.584821227.669643294.254464
# Plot every column of the monthly means as a line chart
mm_bike.plot()
# NOTE(review): fontsize = 0 looks unintended for legend text — confirm the desired size
plt.legend(loc = 'best', fontsize = 0)
<matplotlib.legend.Legend at 0x188d2d0cdd0>

在这里插入图片描述

# 绘制观察哪个月自行车的租赁数目最大
m_bike = bike.groupby(bike.index.month).mean()['count']
m_bike.plot()
plt.grid()

在这里插入图片描述

# 分析每天不同时间自行车租赁数量变化
h_bike = bike.groupby(bike.index.hour).mean()['count']
h_bike.plot(kind ='bar', rot = 0)
<Axes: xlabel='datetime'>

在这里插入图片描述

# 分析天气对租赁数额的影响
weather_bike = bike.groupby(bike.weather).mean()['count']
weather_bike.plot(kind = 'bar', rot = 0)
<Axes: xlabel='weather'>

在这里插入图片描述

;