初始化
import pandas as pd
import numpy as np
# Absolute path of the data directory. Every backslash is escaped explicitly
# so the literal contains no invalid escape sequence such as "\D".
INPUT_PATH = 'G:\\DCIC\\Data\\'
# Number of rows passed as `nrows` when only a sample of a file is read
MAX_ROWS = 100000
数据读取
巡游车GPS数据读取
# Load the first MAX_ROWS rows of the taxi GPS file with default dtypes.
taxi_gps_csv = INPUT_PATH + 'taxiGps20190531.csv'
taxigps2019 = pd.read_csv(taxi_gps_csv, nrows=MAX_ROWS)
# Show summary statistics of the numeric columns
taxigps2019.describe()
OPERATING_STATUS | GPS_SPEED | DRIVING_DIRECTION | LONGITUDE | LATITUDE | |
---|---|---|---|---|---|
count | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 |
mean | 2.415830 | 15.922179 | 161.451510 | 117.388134 | 24.354396 |
std | 2.320472 | 22.837529 | 113.725946 | 9.273066 | 1.932351 |
min | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 1.000000 | 0.000000 | 59.000000 | 118.100982 | 24.480172 |
50% | 1.000000 | 0.000000 | 167.000000 | 118.123175 | 24.493398 |
75% | 6.000000 | 28.900000 | 261.000000 | 118.149498 | 24.516875 |
max | 8.000000 | 381.300000 | 360.000000 | 129.110960 | 34.656481 |
# Preview the first rows of the frame
taxigps2019.head()
OPERATING_STATUS | GPS_SPEED | DRIVING_DIRECTION | GPS_TIME | LONGITUDE | LATITUDE | CARNO | |
---|---|---|---|---|---|---|---|
0 | 1 | 0.0 | 0 | 2019/5/31 0:00:05 | 118.098451 | 24.493498 | c3e622b913ca7085db129fa379121b90 |
1 | 1 | 0.0 | 4 | 2019/5/31 0:00:08 | 118.055053 | 24.564395 | 9b4802bcb6344a5772814557428aee85 |
2 | 1 | 0.0 | 0 | 2019/5/31 0:00:09 | 0.000000 | 0.000000 | f7e16e0e76abf3460007f65cb1509565 |
3 | 1 | 0.0 | 0 | 2019/5/31 0:00:10 | 0.000000 | 0.000000 | c607b7a160ab91bf05f8085ed221196f |
4 | 1 | 0.8 | 32 | 2019/5/31 0:00:12 | 117.990452 | 24.569882 | 7be62de4645772356a3bb7190bc54c6d |
# Print the column dtypes, non-null counts and memory usage
taxigps2019.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 OPERATING_STATUS 100000 non-null int64
1 GPS_SPEED 100000 non-null float64
2 DRIVING_DIRECTION 100000 non-null int64
3 GPS_TIME 100000 non-null object
4 LONGITUDE 100000 non-null float64
5 LATITUDE 100000 non-null float64
6 CARNO 100000 non-null object
dtypes: float64(3), int64(2), object(2)
memory usage: 5.3+ MB
- 将字段类型根据取值空间进行修改,可以压缩内存使用需求
# Narrower per-column dtypes to reduce the memory footprint of the frame.
# NOTE(review): float16 stores speeds only approximately (about three
# significant digits) - confirm that precision is acceptable.
taxi_gps_dtypes = {
    'DRIVING_DIRECTION': np.uint16,
    'OPERATING_STATUS': np.uint8,
    'LONGITUDE': np.float32,
    'LATITUDE': np.float32,
    'GPS_SPEED': np.float16,
}
taxigps2019 = pd.read_csv(
    INPUT_PATH + 'taxiGps20190531.csv',
    nrows=MAX_ROWS,
    dtype=taxi_gps_dtypes,
)
# Report the resulting dtypes and memory usage
taxigps2019.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 OPERATING_STATUS 100000 non-null uint8
1 GPS_SPEED 100000 non-null float16
2 DRIVING_DIRECTION 100000 non-null uint16
3 GPS_TIME 100000 non-null object
4 LONGITUDE 100000 non-null float32
5 LATITUDE 100000 non-null float32
6 CARNO 100000 non-null object
dtypes: float16(1), float32(2), object(2), uint16(1), uint8(1)
memory usage: 2.8+ MB
对GPS数据进行排序
# Reverse the column order, then sort by vehicle id and GPS timestamp and
# renumber the rows from 0.
# GPS_TIME is parsed to datetime before sorting: the raw strings have
# unpadded hours (e.g. '2019/5/31 1:31:20'), so a plain string sort would put
# '10:00:11' ahead of '1:31:20' and the rows would not be chronological.
taxigps2019 = (
    taxigps2019.iloc[:, ::-1]
    .assign(GPS_TIME=lambda frame: pd.to_datetime(frame['GPS_TIME']))
    .sort_values(by=['CARNO', 'GPS_TIME'])
    .reset_index(drop=True)
)
taxigps2019.head()
CARNO | LATITUDE | LONGITUDE | GPS_TIME | DRIVING_DIRECTION | GPS_SPEED | OPERATING_STATUS | |
---|---|---|---|---|---|---|---|
0 | 0006d282be70d06881a7513b69fcaa60 | 24.479755 | 118.146935 | 2019/5/31 1:31:20 | 292 | 0.0 | 1 |
1 | 0006d282be70d06881a7513b69fcaa60 | 24.479755 | 118.146935 | 2019/5/31 1:31:35 | 292 | 0.0 | 1 |
2 | 0006d282be70d06881a7513b69fcaa60 | 24.479755 | 118.146935 | 2019/5/31 1:31:50 | 292 | 0.0 | 1 |
3 | 0006d282be70d06881a7513b69fcaa60 | 24.479755 | 118.146935 | 2019/5/31 1:32:05 | 292 | 0.0 | 1 |
4 | 0006d282be70d06881a7513b69fcaa60 | 24.479755 | 118.146935 | 2019/5/31 1:32:20 | 292 | 0.0 | 1 |
将多天的数据拼接在一起
# Taxi GPS data for 2019-05-31 and 2019-06-01 (first MAX_ROWS rows of each file)
taxi_gps_dtypes = {
    'DRIVING_DIRECTION': np.uint16,
    'OPERATING_STATUS': np.uint8,
    'LONGITUDE': np.float32,
    'LATITUDE': np.float32,
    'GPS_SPEED': np.float16,
}
# ignore_index gives the combined frame one clean 0..n-1 index instead of two
# overlapping per-file ranges.
taxigps2019 = pd.concat(
    [
        pd.read_csv(INPUT_PATH + csv_name, nrows=MAX_ROWS, dtype=taxi_gps_dtypes)
        for csv_name in ('taxiGps20190531.csv', 'taxiGps20190601.csv')
    ],
    ignore_index=True,
)
# Parse the timestamps before sorting: the raw strings have unpadded hours
# (e.g. '2019/5/31 1:31:20'), so sorting them as text is not chronological.
taxigps2019['GPS_TIME'] = pd.to_datetime(taxigps2019['GPS_TIME'])
taxigps2019 = taxigps2019[taxigps2019.columns[::-1]]  # reverse the column order
taxigps2019.sort_values(by=['CARNO', 'GPS_TIME'], inplace=True)
# drop=True discards the pre-sort index instead of keeping it as an extra
# 'index' column.
taxigps2019.reset_index(inplace=True, drop=True)
print(taxigps2019.head())
print(taxigps2019.tail())
index CARNO LATITUDE LONGITUDE \
0 7622 0006d282be70d06881a7513b69fcaa60 24.479755 118.146935
1 11509 0006d282be70d06881a7513b69fcaa60 24.479755 118.146935
2 15763 0006d282be70d06881a7513b69fcaa60 24.479755 118.146935
3 19754 0006d282be70d06881a7513b69fcaa60 24.479755 118.146935
4 23693 0006d282be70d06881a7513b69fcaa60 24.479755 118.146935
GPS_TIME DRIVING_DIRECTION GPS_SPEED OPERATING_STATUS
0 2019/5/31 1:31:20 292 0.0 1
1 2019/5/31 1:31:35 292 0.0 1
2 2019/5/31 1:31:50 292 0.0 1
3 2019/5/31 1:32:05 292 0.0 1
4 2019/5/31 1:32:20 292 0.0 1
index CARNO LATITUDE LONGITUDE \
199995 72693 fff20f025f560278d601b2fd47e1f6b7 24.465818 118.073380
199996 81887 fff20f025f560278d601b2fd47e1f6b7 24.465914 118.071518
199997 84249 fff20f025f560278d601b2fd47e1f6b7 24.465792 118.070946
199998 92843 fff20f025f560278d601b2fd47e1f6b7 24.463242 118.070229
199999 94174 fff20f025f560278d601b2fd47e1f6b7 24.463011 118.070023
GPS_TIME DRIVING_DIRECTION GPS_SPEED OPERATING_STATUS
199995 2019/6/1 0:04:10 266 26.296875 1
199996 2019/6/1 0:04:41 274 26.296875 1
199997 2019/6/1 0:04:49 214 27.203125 1
199998 2019/6/1 0:05:19 198 30.000000 1
199999 2019/6/1 0:05:24 258 27.703125 1
巡游车订单读取
# Column dtypes shared by both daily taxi order files.
taxi_order_dtypes = {
    'GETON_LONGITUDE': np.float32,
    'GETON_LATITUDE': np.float32,
    'GETOFF_LONGITUDE': np.float32,
    'GETOFF_LATITUDE': np.float32,
    'PASS_MILE': np.float16,
    'NOPASS_MILE': np.float16,
    'WAITING_TIME': np.float16,
}
# Read the first MAX_ROWS rows of each day and stack them.
taxiorder2019 = pd.concat([
    pd.read_csv(INPUT_PATH + csv_name, nrows=MAX_ROWS, dtype=taxi_order_dtypes)
    for csv_name in ('taxiOrder20190531.csv', 'taxiOrder20190601.csv')
])
# Use the same vehicle-id column name as the GPS frames, sort by vehicle id
# and pick-up time, and renumber the rows from 0.
taxiorder2019 = (
    taxiorder2019.rename(columns={'CAR_NO': 'CARNO'})
    .sort_values(by=['CARNO', 'GETON_DATE'])
    .reset_index(drop=True)
)
print(taxiorder2019.head())
print(taxiorder2019.tail())
CARNO GETON_DATE GETON_LONGITUDE \
0 0006d282be70d06881a7513b69fcaa60 2019-05-31 00:08:00 118.155060
1 0006d282be70d06881a7513b69fcaa60 2019-05-31 02:30:00 118.154999
2 0006d282be70d06881a7513b69fcaa60 2019-05-31 07:47:00 118.155083
3 0006d282be70d06881a7513b69fcaa60 2019-05-31 08:19:00 118.146347
4 0006d282be70d06881a7513b69fcaa60 2019-05-31 08:43:00 118.129448
GETON_LATITUDE GETOFF_DATE GETOFF_LONGITUDE GETOFF_LATITUDE \
0 24.506035 2019-05-31 00:13:00 118.176666 24.509895
1 24.488607 2019-05-31 02:46:00 118.188156 24.498117
2 24.506119 2019-05-31 08:00:00 118.181938 24.531134
3 24.515144 2019-05-31 08:32:00 118.112679 24.520956
4 24.496716 2019-05-31 08:56:00 118.157806 24.486753
PASS_MILE NOPASS_MILE WAITING_TIME
0 3.099609 1.299805 62.0
1 7.300781 0.000000 326.0
2 4.398438 16.906250 328.0
3 4.601562 0.000000 254.0
4 3.300781 0.199951 438.0
CARNO GETON_DATE \
199995 fff20f025f560278d601b2fd47e1f6b7 2019-06-01 01:03:00
199996 fff20f025f560278d601b2fd47e1f6b7 2019-06-01 01:19:00
199997 fff20f025f560278d601b2fd47e1f6b7 2019-06-01 03:00:00
199998 fff20f025f560278d601b2fd47e1f6b7 2019-06-01 10:05:00
199999 fff20f025f560278d601b2fd47e1f6b7 2019-06-01 10:44:00
GETON_LONGITUDE GETON_LATITUDE GETOFF_DATE \
199995 118.118797 24.428305 2019-06-01 01:07:00
199996 118.119179 24.430992 2019-06-01 01:35:00
199997 118.173531 24.481377 2019-06-01 03:00:00
199998 118.158691 24.448475 2019-06-01 10:20:00
199999 118.173271 24.463055 2019-06-01 11:00:00
GETOFF_LONGITUDE GETOFF_LATITUDE PASS_MILE NOPASS_MILE \
199995 118.111732 24.440104 2.000000 10.601562
199996 118.170212 24.487490 12.203125 4.601562
199997 118.171600 24.482180 0.099976 30.000000
199998 118.118454 24.479401 8.898438 22.906250
199999 118.088173 24.439388 11.796875 10.500000
WAITING_TIME
199995 59.0
199996 142.0
199997 0.0
199998 179.0
199999 210.0
网约车GPS数据读取
# Ride-hailing GPS sample: first MAX_ROWS rows with narrowed float dtypes.
wyc_gps_dtypes = {
    'LONGITUDE': np.float32,
    'LATITUDE': np.float32,
    'SPEED': np.float16,
}
wycgps2019 = pd.read_csv(
    INPUT_PATH + 'wycGps20190531.csv',
    nrows=MAX_ROWS,
    dtype=wyc_gps_dtypes,
)
# Rename the vehicle-id column, reverse the column order and sort by vehicle
# id and position time.
# NOTE(review): POSITION_TIME is sorted as text here; if its hours are not
# zero-padded the order is not chronological - confirm against the raw file.
wycgps2019 = (
    wycgps2019.rename(columns={'CAR_NO': 'CARNO'})
    .iloc[:, ::-1]
    .sort_values(by=['CARNO', 'POSITION_TIME'])
)
# Missing status flags become -1 so both columns fit into int8.
for flag_col in ('BIZ_STATUS', 'ENCRYPT'):
    wycgps2019[flag_col] = wycgps2019[flag_col].fillna(-1).astype(np.int8)
wycgps2019.head()
CARNO | ORDER_ID | BIZ_STATUS | SPEED | DIRECTION | ENCRYPT | LATITUDE | LONGITUDE | POSITION_TIME | |
---|---|---|---|---|---|---|---|---|---|
88732 | 0006d282be70d06881a7513b69fcaa60 | 0 | -1 | 8.0 | 92.0 | 1 | 24.503584 | 118.157280 | 2019/5/31 20:33:05 |
31769 | 0006d282be70d06881a7513b69fcaa60 | 0 | -1 | 8.0 | 85.0 | 1 | 24.477930 | 118.083000 | 2019/5/31 23:17:35 |
58098 | 0006d282be70d06881a7513b69fcaa60 | 0 | -1 | 5.0 | 267.0 | 1 | 24.474464 | 118.082787 | 2019/5/31 23:20:18 |
38891 | 0006d282be70d06881a7513b69fcaa60 | 0 | -1 | 8.0 | 267.0 | 1 | 24.474428 | 118.080521 | 2019/5/31 23:20:49 |
23289 | 0006d282be70d06881a7513b69fcaa60 | 0 | -1 | 18.0 | 298.0 | 1 | 24.493311 | 118.084457 | 2019/5/31 23:25:58 |
网约车订单数据读取
# Ride-hailing order sample: first MAX_ROWS rows, coordinates stored as float32.
wyc_order_dtypes = {
    'DEP_LONGITUDE': np.float32,
    'DEP_LATITUDE': np.float32,
    'DEST_LONGITUDE': np.float32,
    'DEST_LATITUDE': np.float32,
}
wycorder2019 = pd.read_csv(
    INPUT_PATH + 'wycOrder20190531.csv',
    nrows=MAX_ROWS,
    dtype=wyc_order_dtypes,
)
# Rename the vehicle-id column and sort by vehicle id and departure time;
# the original row labels are kept.
wycorder2019 = (
    wycorder2019.rename(columns={'CAR_NO': 'CARNO'})
    .sort_values(by=['CARNO', 'DEP_TIME'])
)
wycorder2019.head()
ORDER_ID | ON_AREA | CARNO | BOOK_DEP_TIME | WAIT_TIME | DEP_LONGITUDE | DEP_LATITUDE | DEP_TIME | DEST_LONGITUDE | DEST_LATITUDE | DEST_TIME | DRIVE_MILE | DRIVE_TIME | WAIT_MILE | ORDER_MATCH_TIME | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
69056 | P190531163151202000 | 350200 | 000ec98fb3522d1d3f3ffd9f43c6617b | 20190531163100 | 0.0 | 118.183853 | 24.527201 | 20190531163743 | 118.164581 | 24.539051 | 20190531164441 | 3.8 | 418 | 0.0 | 20190531185034 |
33401 | P190531190532186001 | 350200 | 000ec98fb3522d1d3f3ffd9f43c6617b | 20190531190500 | 0.0 | 118.087311 | 24.440067 | 20190531191327 | 118.086441 | 24.468336 | 20190531193002 | 5.2 | 995 | 0.0 | 20190531214125 |
36568 | B190531193036068000 | 350200 | 000ec98fb3522d1d3f3ffd9f43c6617b | 20190531193036 | 0.0 | 118.082085 | 24.461121 | 20190531194114 | 118.119873 | 24.477873 | 20190531195730 | 5.4 | 976 | 0.0 | 20190531220126 |
58463 | B190531195846071000 | 350200 | 000ec98fb3522d1d3f3ffd9f43c6617b | 20190531195846 | 0.0 | 118.117836 | 24.476557 | 20190531200107 | 118.155640 | 24.508106 | 20190531201939 | 8.0 | 1112 | 0.1 | 20190531222130 |
5912 | P190531202421202000 | 350200 | 000ec98fb3522d1d3f3ffd9f43c6617b | 20190531202400 | 0.0 | 118.150558 | 24.509783 | 20190531202734 | 118.153320 | 24.479868 | 20190531203854 | 4.8 | 680 | 0.0 | 20190531230145 |
数据统计
对5月31日的数据进行统计
# Taxi GPS data (whole file, no row limit)
taxi_gps_dtypes = {
    'DRIVING_DIRECTION': np.uint16,
    'OPERATING_STATUS': np.uint8,
    'LONGITUDE': np.float32,
    'LATITUDE': np.float32,
    'GPS_SPEED': np.float16,
}
taxigps2019 = pd.read_csv(INPUT_PATH + 'taxiGps20190531.csv', dtype=taxi_gps_dtypes)
# Parse the timestamps before sorting: the raw strings have unpadded hours
# (e.g. '2019/5/31 1:31:20'), so sorting them as text would put '10:00:11'
# ahead of '1:31:20' and the per-vehicle rows would not be chronological.
taxigps2019['GPS_TIME'] = pd.to_datetime(taxigps2019['GPS_TIME'])
taxigps2019 = taxigps2019[taxigps2019.columns[::-1]]  # reverse the column order
taxigps2019.sort_values(by=['CARNO', 'GPS_TIME'], inplace=True)  # sort by vehicle id, then GPS time
taxigps2019.reset_index(inplace=True, drop=True)
# Taxi order data (whole file, no row limit)
taxi_order_dtypes = {
    'GETON_LONGITUDE': np.float32,
    'GETON_LATITUDE': np.float32,
    'GETOFF_LONGITUDE': np.float32,
    'GETOFF_LATITUDE': np.float32,
    'PASS_MILE': np.float16,
    'NOPASS_MILE': np.float16,
    'WAITING_TIME': np.float16,
}
taxiorder2019 = pd.read_csv(
    INPUT_PATH + 'taxiOrder20190531.csv',
    dtype=taxi_order_dtypes,
)
# Rename the vehicle-id column, sort by vehicle id and pick-up time, and
# renumber the rows from 0.
taxiorder2019 = (
    taxiorder2019.rename(columns={'CAR_NO': 'CARNO'})
    .sort_values(by=['CARNO', 'GETON_DATE'])
    .reset_index(drop=True)
)
# Ride-hailing GPS data (whole file, no row limit)
wyc_gps_dtypes = {
    'LONGITUDE': np.float32,
    'LATITUDE': np.float32,
    'SPEED': np.float16,
}
wycgps2019 = pd.read_csv(INPUT_PATH + 'wycGps20190531.csv', dtype=wyc_gps_dtypes)
# Rename the vehicle-id column, reverse the column order and sort by vehicle
# id and position time; the original row labels are kept.
# NOTE(review): POSITION_TIME is sorted as text here; if its hours are not
# zero-padded the order is not chronological - confirm against the raw file.
wycgps2019 = (
    wycgps2019.rename(columns={'CAR_NO': 'CARNO'})
    .iloc[:, ::-1]
    .sort_values(by=['CARNO', 'POSITION_TIME'])
)
# Missing status flags become -1 so both columns fit into int8.
for flag_col in ('BIZ_STATUS', 'ENCRYPT'):
    wycgps2019[flag_col] = wycgps2019[flag_col].fillna(-1).astype(np.int8)
# Ride-hailing order data (whole file, no row limit)
wyc_order_dtypes = {
    'DEP_LONGITUDE': np.float32,
    'DEP_LATITUDE': np.float32,
    'DEST_LONGITUDE': np.float32,
    'DEST_LATITUDE': np.float32,
}
wycorder2019 = pd.read_csv(
    INPUT_PATH + 'wycOrder20190531.csv',
    dtype=wyc_order_dtypes,
)
# Rename the vehicle-id column and sort by vehicle id and departure time.
wycorder2019 = (
    wycorder2019.rename(columns={'CAR_NO': 'CARNO'})
    .sort_values(by=['CARNO', 'DEP_TIME'])
)
- 有多少辆巡游车:
taxigps2019['CARNO'].nunique() # number of distinct CARNO values
6768
- 有多少辆网约车:
wycgps2019['CARNO'].nunique() # number of distinct CARNO values
27849
- 巡游车平均GPS速度:
# Mean GPS speed after clamping every reading into [0, 150]: readings above
# 150 are replaced by 150 and readings below 0 by 0.
taxigps2019['GPS_SPEED'].to_numpy().clip(0, 150).mean()
17.42
- 巡游车运营状态统计:
taxigps2019['OPERATING_STATUS'].value_counts()
# Lists the distinct values of the column with the number of rows holding each
1 9540708
6 6916642
8 371497
Name: OPERATING_STATUS, dtype: int64
- 查看某辆巡游车的数据:
# Preview the first GPS rows of one vehicle
sample_carno = '0006d282be70d06881a7513b69fcaa60'
taxigps2019.loc[taxigps2019['CARNO'] == sample_carno].head()
CARNO | LATITUDE | LONGITUDE | GPS_TIME | DRIVING_DIRECTION | GPS_SPEED | OPERATING_STATUS | |
---|---|---|---|---|---|---|---|
0 | 0006d282be70d06881a7513b69fcaa60 | 24.506195 | 118.128929 | 2019/5/31 10:00:11 | 12 | 9.203125 | 1 |
1 | 0006d282be70d06881a7513b69fcaa60 | 24.506210 | 118.128937 | 2019/5/31 10:00:26 | 10 | 0.000000 | 1 |
2 | 0006d282be70d06881a7513b69fcaa60 | 24.506210 | 118.128937 | 2019/5/31 10:00:41 | 10 | 0.000000 | 1 |
3 | 0006d282be70d06881a7513b69fcaa60 | 24.506210 | 118.128937 | 2019/5/31 10:00:56 | 10 | 0.000000 | 1 |
4 | 0006d282be70d06881a7513b69fcaa60 | 24.506210 | 118.128937 | 2019/5/31 10:01:11 | 10 | 0.000000 | 1 |
- 统计某个运行方向的车辆:
# Number of distinct vehicles with at least one record whose
# DRIVING_DIRECTION equals 10
heading_mask = taxigps2019['DRIVING_DIRECTION'] == 10
len(taxigps2019.loc[heading_mask, 'CARNO'].unique())
6268
- 统计最多的GPS小时(大部分数据都是几点):
# Convert GPS_TIME to datetime, then count the records in each hour of day
taxigps2019['GPS_TIME'] = pd.to_datetime(taxigps2019['GPS_TIME'])
gps_hour = taxigps2019['GPS_TIME'].dt.hour
gps_hour.value_counts()
23 1059242
22 1054752
10 1054140
9 1053138
21 1047899
20 1042931
8 1042160
19 1039729
17 1038851
18 1033548
7 1002225
6 903491
2 893999
4 859305
3 852674
5 816163
1 457019
11 439676
16 131282
0 2273
15 1394
14 1285
13 851
12 820
Name: GPS_TIME, dtype: int64
-
统计某个巡游车在一天内的载客时间:
$\sum(\text{下车时间}-\text{上车时间})$
# Total occupied time of one vehicle: sum of (drop-off time - pick-up time)
# over all of its orders.
temp = taxiorder2019.loc[taxiorder2019['CARNO'] == '0006d282be70d06881a7513b69fcaa60']
getoff_at = pd.to_datetime(temp['GETOFF_DATE'])
geton_at = pd.to_datetime(temp['GETON_DATE'])
dtime = getoff_at - geton_at
dtime.sum()
Timedelta('0 days 11:04:00')
- 统计某个巡游车在一天内的有效运行时间(以分钟为单位,即有 GPS 记录的不同分钟数):
# Number of distinct minutes of the day in which one vehicle has at least one
# GPS record. Requires GPS_TIME to already be a datetime column.
temp = taxigps2019[taxigps2019['CARNO'] == '0006d282be70d06881a7513b69fcaa60']
# Integer minute-of-day key. Concatenating hour and minute as strings is
# ambiguous (1:12 and 11:02 both give '112') and would undercount.
run_time = temp['GPS_TIME'].dt.hour * 60 + temp['GPS_TIME'].dt.minute
run_time.nunique()
992
-
统计某个巡游车的行驶距离:
平均 GPS 速度 * 行驶时间
# Rough distance estimate for one vehicle: mean clamped GPS speed multiplied
# by the time between its first and last GPS record, expressed in hours.
# Requires GPS_TIME to already be a datetime column.
temp = taxigps2019[taxigps2019['CARNO'] == '0006d282be70d06881a7513b69fcaa60']
# Clamp speeds into [0, 150] before averaging
mean_speed = np.clip(temp['GPS_SPEED'].values, 0, 150).mean()
# Series.max()/min() are vectorised; the builtins max()/min() would walk the
# Series element by element in Python.
run_time = temp['GPS_TIME'].max() - temp['GPS_TIME'].min()
(run_time / np.timedelta64(1, 'h')) * mean_speed
389.4881163194445
- 统计巡游车订单数据上、下车经纬度的最大最小值(最小值只统计大于 0 的坐标):
# Print the maximum and minimum of each pick-up / drop-off coordinate column
# of the taxi orders, one "label: value" line per statistic.
coordinate_columns = (
    ('上车经度', 'GETON_LONGITUDE'),
    ('上车纬度', 'GETON_LATITUDE'),
    ('下车经度', 'GETOFF_LONGITUDE'),
    ('下车纬度', 'GETOFF_LATITUDE'),
)
for label, column in coordinate_columns:
    values = taxiorder2019[column]
    print(label + '最大值', end=': ')
    print(values.max())
    print(label + '最小值', end=': ')
    # Only strictly positive coordinates are considered for the minimum
    print(values[values > 0].min())
上车经度最大值: 126.43206
上车经度最小值: 108.93546
上车纬度最大值: 34.656197
上车纬度最小值: 12.239963
下车经度最大值: 126.995125
下车经度最小值: 100.401085
下车纬度最大值: 34.69632
下车纬度最小值: 12.239963
- 统计巡游车订单数据集中下车经纬度最常见的位置(经度+纬度,各保留三位小数后组合得到具体位置):
def f(x):
    """Return x rounded to three decimal places, as a string."""
    return str(round(x, 3))


# Build a "longitude,latitude" key per order from the rounded drop-off
# coordinates, then count how often each key occurs.
getoff_lon_text = taxiorder2019['GETOFF_LONGITUDE'].map(f)
getoff_lat_text = taxiorder2019['GETOFF_LATITUDE'].map(f)
taxiorder2019['GETOFF_POSITION'] = getoff_lon_text + ',' + getoff_lat_text
taxiorder2019['GETOFF_POSITION'].value_counts()
0.0,0.0 3340
118.144,24.547 1860
118.07,24.638 1786
118.127,24.537 1731
118.116,24.481 1437
...
118.114,24.678 1
118.023,24.47 1
118.123,24.524 1
118.161,24.721 1
117.98,24.477 1
Name: GETOFF_POSITION, Length: 13267, dtype: int64
GPS 定位经纬度都为 0 的点是异常值(对应的真实地理位置在非洲西海岸外的几内亚湾),可能是由于没有开启 GPS 定位功能、系统故障或者没有网络造成的。
-
找到数据中的异常值:
切比雪夫定理:可用于判断异常值(对任意分布都成立,给出的是下界)
至少 75% 的数据,位于平均数两个标准差范围内
至少约 89% 的数据,位于平均数三个标准差范围内
至少 96% 的数据,位于平均数五个标准差范围内
-
对于异常值可以利用插值的方法重新赋值
# Replace drop-off longitudes lying more than five standard deviations from
# the mean with interpolated values.
getoff_lon = taxiorder2019['GETOFF_LONGITUDE']
lon_mean = getoff_lon.mean()
lon_std = getoff_lon.std()
lb = lon_mean - 5 * lon_std
ub = lon_mean + 5 * lon_std
# Mark the outliers as missing. A single .loc call writes into taxiorder2019
# itself; chained indexing (df[mask][col] = ...) would assign to a temporary
# copy and leave the frame unchanged.
outlier_mask = (getoff_lon > ub) | (getoff_lon < lb)
taxiorder2019.loc[outlier_mask, 'GETOFF_LONGITUDE'] = np.nan
# Fill the missing values by interpolation (pandas default method)
taxiorder2019['GETOFF_LONGITUDE'] = taxiorder2019['GETOFF_LONGITUDE'].interpolate()