文本清理:在自然语言处理中,文本清理的具体做法在很大程度上取决于所做的任务,但仍有一些清理流程是通用的,比如是否有必要替换单位、货币、数学符号、数字。可以使用正则表达式工具将相应内容替换为标准内容。
工具:re(简介)
输入:原始文本
输出:干净文本
流程:单位替换 、略缩词替换、拼写校对、标点处理、符号替换、移除多余空格
代码:notebook
单位替换
将文本中的单位替换为统一格式,如:将4kgs、4kg统一替换为4 kg,将4k替换为4000,将$100或100$替换为100 dollar。
import random
import re
text = "I want to lose 4kgs in a month. What does 4k mean in a salary?What is the best way to make money with $100?"
# Unit normalisation. Each rule is (pattern, text appended after the captured
# number); rules run in order, each one on the output of the previous one.
unit_rules = [
    (r"(\d+)kgs ", ' kg '),      # e.g. 4kgs => 4 kg
    (r"(\d+)kg ", ' kg '),       # e.g. 4kg => 4 kg
    (r"(\d+)k ", '000 '),        # e.g. 4k => 4000
    (r"\$(\d+)", ' dollar '),    # e.g. $100 => 100 dollar
    (r"(\d+)\$", ' dollar '),    # e.g. 100$ => 100 dollar
]
for pattern, suffix in unit_rules:
    # Bind suffix as a default argument so each callback keeps its own value.
    text = re.sub(pattern, lambda m, suffix=suffix: m.group(1) + suffix, text)
text
'I want to lose 4 kg in a month. What does 4000 mean in a salary?What is the best way to make money with 100 dollar ?'
略缩词替换
将文本中的缩写形式替换为完整写法,如can't、cannot替换为can not,'ve替换为have,c#替换为csharp等。
text = "Why India can't compete with China in manufacturing. What is the biggest scam you've ever seen? Why Should I Learn c#? "
# Contraction / shorthand expansion. Rules are (pattern, replacement) pairs and
# are applied top to bottom, so a later rule sees the output of earlier ones
# (e.g. "can't" is handled before the generic "n't" rule).
expansion_rules = (
    (r"can\'t", "can not"),
    (r"cannot", "can not "),
    (r"what\'s", "what is"),
    (r"What\'s", "what is"),
    (r"\'ve ", " have "),
    (r"n\'t", " not "),
    (r"i\'m", "i am "),
    (r"I\'m", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r"c\+\+", "cplusplus"),
    (r"c \+\+", "cplusplus"),
    (r"c \+ \+", "cplusplus"),
    (r"c#", "csharp"),
    (r"f#", "fsharp"),
    (r"g#", "gsharp"),
    (r" e mail ", " email "),
    (r" e \- mail ", " email "),
    (r" e\-mail ", " email "),
    (r",000", '000'),
    # Any remaining 's is replaced by a single space.
    (r"\'s", " "),
)
for pattern, replacement in expansion_rules:
    text = re.sub(pattern, replacement, text)
text
'Why India can not compete with China in manufacturing. What is the biggest scam you have ever seen? Why Should I Learn csharp? '
拼写校对
如将ph.d、PhD替换为phd,去掉多余空格,将缩写替换为全拼、将阿拉伯数字替换为英文数字、将美元复数替换为单数等。
text = "ph.d PhD pokemons e g fb usa 1 2 3 googling rs1 dollars"
# Spelling normalisation. Each entry is (pattern, replacement); the replacement
# is either a plain string or a callable taking the match object. Entries are
# applied strictly in the order listed.
spelling_rules = (
    (r"ph\.d", "phd"),
    (r"PhD", "phd"),
    (r"pokemons", "pokemon"),
    (r"pokémon", "pokemon"),
    (r"pokemon go ", "pokemon-go "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" 9 11 ", " 911 "),
    (r" j k ", " jk "),
    (r" fb ", " facebook "),
    (r"facebooks", " facebook "),
    (r"facebooking", " facebook "),
    (r"insidefacebook", "inside facebook"),
    (r"donald trump", "trump"),
    (r"the big bang", "big-bang"),
    (r"the european union", "eu"),
    (r" usa ", " america "),
    (r" us ", " america "),
    (r" u s ", " america "),
    (r" U\.S\. ", " america "),
    (r" US ", " america "),
    (r" American ", " america "),
    (r" America ", " america "),
    (r" quaro ", " quora "),
    (r" mbp ", " macbook-pro "),
    (r" mac ", " macbook "),
    (r"macbook pro", "macbook-pro"),
    (r"macbook-pros", "macbook-pro"),
    # Stand-alone single digits (surrounded by spaces) become English words.
    (r" 1 ", " one "),
    (r" 2 ", " two "),
    (r" 3 ", " three "),
    (r" 4 ", " four "),
    (r" 5 ", " five "),
    (r" 6 ", " six "),
    (r" 7 ", " seven "),
    (r" 8 ", " eight "),
    (r" 9 ", " nine "),
    (r"googling", " google "),
    (r"googled", " google "),
    (r"googleable", " google "),
    (r"googles", " google "),
    # "rs" attached to a number is split off and placed before the number.
    (r" rs(\d+)", lambda m: ' rs ' + m.group(1)),
    (r"(\d+)rs", lambda m: ' rs ' + m.group(1)),
    (r"the european union", " eu "),
    (r"dollars", " dollar "),
)
for pattern, replacement in spelling_rules:
    text = re.sub(pattern, replacement, text)
text
'phd phd pokemon eg facebook america one two three google rs 1 dollar '
标点处理
在标点两旁加上空格,并去除撇号(')。
text = "1+1=2 What is the biggest scam you have ever seen?I am learning csharp?"
# Punctuation handling
# Apostrophes are dropped: each one is replaced by a single space.
text = re.sub(r"'", " ", text)
# Every other handled mark is padded with one space on each side:
#   + - / \ = ^ : . , ? ! " & | ; ( )
# A single character-class pass is equivalent to padding the marks one at a
# time, because padding only inserts spaces and never creates a new mark.
# This also fixes two defects of the one-rule-per-mark version: ")" was being
# rewritten to " ( ", and the backslash rule used the non-raw literal " \ ",
# which is an invalid escape sequence.
text = re.sub(r'([-+/\\=^:.,?!"&|;()])', r" \1 ", text)
text
'1 + 1 = 2 What is the biggest scam you have ever seen ? I am learning csharp ? '
符号替换
将逻辑符号、数学符号和货币符号替换为单词。
text = "1 + 1 = 2 ₹ "
# Symbol replacement: each symbol is rewritten as a space-padded word.
symbol_rules = (
    (r"&", " and "),
    (r"\|", " or "),
    (r"=", " equal "),
    (r"\+", " plus "),
    (r"₹", " rs "),  # rupee sign (the original author marked this rule "test!")
    (r"\$", " dollar "),
)
for pattern, word in symbol_rules:
    text = re.sub(pattern, word, text)
text
'1 plus 1 equal 2 rs '
移除多余空格
# Remove extra whitespace: split on whitespace runs, then re-join the pieces
# with single spaces (this also trims leading and trailing whitespace).
words = text.split()
text = ' '.join(words)
text
'1 plus 1 equal 2 rs'
所有代码
# Rule tables used by clean_text. Every entry is (regex pattern, replacement
# template) and the tables are applied in the order written: each rule operates
# on the output of the one before it, so the order is part of the behaviour.

# Units and currency. "\g<1>" is used instead of "\1" so that a template such
# as "\g<1>000" is unambiguous (a bare "\1000" would not be parsed as group 1
# followed by the literal "000").
_UNIT_RULES = (
    (r"(\d+)kgs ", r"\g<1> kg "),      # e.g. 4kgs => 4 kg
    (r"(\d+)kg ", r"\g<1> kg "),       # e.g. 4kg => 4 kg
    (r"(\d+)k ", r"\g<1>000 "),        # e.g. 4k => 4000
    (r"\$(\d+)", r"\g<1> dollar "),    # e.g. $100 => 100 dollar
    (r"(\d+)\$", r"\g<1> dollar "),    # e.g. 100$ => 100 dollar
)

# Contractions and shorthand. "can't" is listed before the generic "n't" rule
# so it expands to "can not" rather than "ca not".
_ABBREVIATION_RULES = (
    (r"can\'t", "can not"),
    (r"cannot", "can not "),
    (r"what\'s", "what is"),
    (r"What\'s", "what is"),
    (r"\'ve ", " have "),
    (r"n\'t", " not "),
    (r"i\'m", "i am "),
    (r"I\'m", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r"c\+\+", "cplusplus"),
    (r"c \+\+", "cplusplus"),
    (r"c \+ \+", "cplusplus"),
    (r"c#", "csharp"),
    (r"f#", "fsharp"),
    (r"g#", "gsharp"),
    (r" e mail ", " email "),
    (r" e \- mail ", " email "),
    (r" e\-mail ", " email "),
    (r",000", "000"),
    # Any 's still present at this point is replaced by a single space.
    (r"\'s", " "),
)

# Spelling / vocabulary normalisation.
_SPELLING_RULES = (
    (r"ph\.d", "phd"),
    (r"PhD", "phd"),
    (r"pokemons", "pokemon"),
    (r"pokémon", "pokemon"),
    (r"pokemon go ", "pokemon-go "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" 9 11 ", " 911 "),
    (r" j k ", " jk "),
    (r" fb ", " facebook "),
    (r"facebooks", " facebook "),
    (r"facebooking", " facebook "),
    (r"insidefacebook", "inside facebook"),
    (r"donald trump", "trump"),
    (r"the big bang", "big-bang"),
    (r"the european union", "eu"),
    (r" usa ", " america "),
    (r" us ", " america "),
    (r" u s ", " america "),
    (r" U\.S\. ", " america "),
    (r" US ", " america "),
    (r" American ", " america "),
    (r" America ", " america "),
    (r" quaro ", " quora "),
    (r" mbp ", " macbook-pro "),
    (r" mac ", " macbook "),
    (r"macbook pro", "macbook-pro"),
    (r"macbook-pros", "macbook-pro"),
    # Stand-alone single digits (a space on both sides) become English words.
    # This runs after the unit rules, so "4kgs " -> "4 kg " -> "four kg ".
    (r" 1 ", " one "),
    (r" 2 ", " two "),
    (r" 3 ", " three "),
    (r" 4 ", " four "),
    (r" 5 ", " five "),
    (r" 6 ", " six "),
    (r" 7 ", " seven "),
    (r" 8 ", " eight "),
    (r" 9 ", " nine "),
    (r"googling", " google "),
    (r"googled", " google "),
    (r"googleable", " google "),
    (r"googles", " google "),
    # "rs" attached to a number is split off and placed before the number.
    (r" rs(\d+)", r" rs \g<1>"),
    (r"(\d+)rs", r" rs \g<1>"),
    # Kept for parity with the original rule list; the identical pattern
    # earlier in this table has already rewritten every occurrence.
    (r"the european union", " eu "),
    (r"dollars", " dollar "),
)

# Punctuation marks that get one space of padding on each side:
#   + - / \ = ^ : . , ? ! " & | ; ( )
_PUNCTUATION = re.compile(r'([-+/\\=^:.,?!"&|;()])')

# Symbols spelled out as words (applied after punctuation padding).
_SYMBOL_RULES = (
    (r"&", " and "),
    (r"\|", " or "),
    (r"=", " equal "),
    (r"\+", " plus "),
    (r"₹", " rs "),
    (r"\$", " dollar "),
)


def clean_text(text):
    """
    Clean text

    Applies, in order: unit/currency normalisation, contraction expansion,
    spelling normalisation, punctuation padding, symbol-to-word replacement,
    and finally whitespace collapsing. All rules are case-sensitive regular
    expression substitutions.

    :param text: the string of text
    :return: text string after cleaning, with runs of whitespace collapsed to
        single spaces and no leading or trailing whitespace
    """
    # unit, acronym, spelling correction
    for rules in (_UNIT_RULES, _ABBREVIATION_RULES, _SPELLING_RULES):
        for pattern, replacement in rules:
            text = re.sub(pattern, replacement, text)

    # punctuation: remaining apostrophes become a space, and every other
    # handled mark is padded with spaces. One character-class pass is
    # equivalent to padding the marks one by one (padding only adds spaces)
    # and keeps ")" as ")" instead of rewriting it to "(".
    text = re.sub(r"'", " ", text)
    text = _PUNCTUATION.sub(r" \1 ", text)

    # symbol replacement
    for pattern, replacement in _SYMBOL_RULES:
        text = re.sub(pattern, replacement, text)

    # remove extra space
    return ' '.join(text.split())
调用
# Sample input made by concatenating the example strings from the individual
# cleaning steps, followed by a few extra "+", "=" and rupee-sign tokens.
text = "I want to lose 4kgs in a month. What does 4k mean in a salary?What is the best way to make money with $100? Why India can't compete with China in manufacturing. What is the biggest scam you've ever seen? Why Should I Learn c#? ph.d PhD pokemons e g fb usa 1 2 3 googling rs1 dollars 1 +1= 2 ₹ ₹ ₹₹₹₹"
text
"I want to lose 4kgs in a month. What does 4k mean in a salary?What is the best way to make money with $100? Why India can't compete with China in manufacturing. What is the biggest scam you've ever seen? Why Should I Learn c#? ph.d PhD pokemons e g fb usa 1 2 3 googling rs1 dollars 1 +1= 2 ₹ ₹ ₹₹₹₹"
# Run the full cleaning pipeline on the sample input.
clean_text(text)
'I want to lose four kg in a month . What does 4000 mean in a salary ? What is the best way to make money with 100 dollar ? Why India can not compete with China in manufacturing . What is the biggest scam you have ever seen ? Why Should I Learn csharp ? phd phd pokemon eg facebook america one two three google rs 1 dollar one plus 1 equal two rs rs rs rs rs rs'