朴素贝叶斯对新闻进行预测分类
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
# Classify 20 Newsgroups posts with a multinomial naive Bayes model
# trained on bag-of-words counts, then report accuracy and per-class metrics.

# Experiment settings.
SAMPLE_LIMIT = 10000  # only the first SAMPLE_LIMIT posts are used
TEST_FRACTION = 0.25  # share of the sampled posts held out for evaluation
RANDOM_SEED = 33  # fixed seed so the train/test split is reproducible

# subset="all" requests the training and test portions of the corpus together.
news = fetch_20newsgroups(subset="all")

# Raw post texts and their integer class labels.
x = news.data[:SAMPLE_LIMIT]
y = news.target[:SAMPLE_LIMIT]

# Split into training and test sets.
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=TEST_FRACTION, random_state=RANDOM_SEED
)

# Convert texts to token-count vectors. The vocabulary is fitted on the
# training texts only; the test texts are mapped with that same vocabulary.
vec = CountVectorizer()
train_x = vec.fit_transform(train_x)
test_x = vec.transform(test_x)

# Initialise and train the naive Bayes model.
mnb = MultinomialNB()
mnb.fit(train_x, train_y)

# Predict on the held-out posts and report the results.
y_pre = mnb.predict(test_x)
print("准确率:", mnb.score(test_x, test_y))
print("分类报告:", classification_report(test_y, y_pre))
运行结果如下:
准确率: 0.8192
分类报告:               precision    recall  f1-score   support

           0       0.83      0.84      0.83        99
           1       0.80      0.79      0.80       146
           2       1.00      0.13      0.23       125
           3       0.57      0.84      0.68       112
           4       0.90      0.72      0.80       134
           5       0.59      0.93      0.72       123
           6       0.95      0.55      0.70       136
           7       0.84      0.87      0.85       126
           8       0.95      0.92      0.94       127
           9       0.98      0.94      0.96       128
          10       0.95      0.96      0.95       113
          11       0.70      0.99      0.82       138
          12       0.85      0.76      0.80       136
          13       0.92      0.90      0.91       136
          14       0.88      0.93      0.91       132
          15       0.76      0.98      0.86       125
          16       0.80      0.94      0.87       144
          17       0.89      0.95      0.92       130
          18       0.87      0.84      0.86       102
          19       0.92      0.50      0.65        88

    accuracy                           0.82      2500
   macro avg       0.85      0.81      0.80      2500
weighted avg       0.85      0.82      0.81      2500