数据不一致
解决一些数据不一致问题
例如:“usa” 和 "USA"是一个东西,让它们在数据中变成一样。
# modules we'll use
import pandas as pd
import numpy as np
# helpful modules
import fuzzywuzzy
from fuzzywuzzy import process
import charset_normalizer
# read in all our data
professors = pd.read_csv("../input/pakistan-intellectual-capital/pakistan_intellectual_capital.csv")
# set seed for reproducibility
np.random.seed(0)
# get all the unique values in the 'Country' column
countries = professors['Country'].unique()
# sort them alphabetically and then take a closer look
countries.sort()
countries
array([' Germany', ' New Zealand', ' Sweden', ' USA', 'Australia',
'Austria', 'Canada', 'China', 'Finland', 'France', 'Greece',
'HongKong', 'Ireland', 'Italy', 'Japan', 'Macau', 'Malaysia',
'Mauritius', 'Netherland', 'New Zealand', 'Norway', 'Pakistan',
'Portugal', 'Russian Federation', 'Saudi Arabia', 'Scotland',
'Singapore', 'South Korea', 'SouthKorea', 'Spain', 'Sweden',
'Thailand', 'Turkey', 'UK', 'USA', 'USofA', 'Urbana', 'germany'],
dtype=object)
# convert to lower case
# 全部转变为小写
professors['Country'] = professors['Country'].str.lower()
# remove trailing white spaces
# 去掉尾部的空格
professors['Country'] = professors['Country'].str.strip()
使用 fuzzy 库
Fuzzy matching ——模糊匹配
# get the top 10 closest matches to "south korea"
matches = fuzzywuzzy.process.extract("south korea", countries, limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)
# take a look at them
matches
[('south korea', 100),
('southkorea', 48),
('saudi arabia', 43),
('norway', 35),
('austria', 33),
('ireland', 33),
('pakistan', 32),
('portugal', 32),
('scotland', 32),
('australia', 30)]
# function to replace rows in the provided column of the provided dataframe
# that match the provided string above the provided ratio with the provided string
def replace_matches_in_column(df, column, string_to_match, min_ratio = 47):
# get a list of unique strings
strings = df[column].unique()
# get the top 10 closest matches to our input string
matches = fuzzywuzzy.process.extract(string_to_match, strings,
limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)
# only get matches with a ratio > 90
close_matches = [matches[0] for matches in matches if matches[1] >= min_ratio]
# get the rows of all the close matches in our dataframe
rows_with_matches = df[column].isin(close_matches)
# replace all rows with close matches with the input matches
df.loc[rows_with_matches, column] = string_to_match
# let us know the function's done
print("All done!")
# use the function we just wrote to replace close matches to "south korea" with "south korea"
replace_matches_in_column(df=professors, column='Country', string_to_match="south korea")