利用朴素贝叶斯进行垃圾邮件分类
朴素贝叶斯在文本分类、垃圾邮件识别、情感分析等方面有很好的应用。进来就来体验一下,数据借用了https://github.com/shijing888/BayesSpam,他文件夹里面有分类好的正常邮件,垃圾邮件和测试邮件。也有代码,不过我这个代码我自己写的。
主要过程:读取邮件-分词-转化为词袋模型、调用朴素贝叶斯进行分类。代码比较乱,直接上代码:
import jieba,re
import os
import numpy as np
from sklearn.naive_bayes import BernoulliNB
filepath=r'C:\Users\lbship\Desktop\脚本\BayesSpam-master\data\\'
#获取停用词,用set去重再转化为列表
stopword=list(set('sep'.join(i for i in open(r'C:\Users\lbship\Desktop\脚本\BayesSpam-master\data\中文停用词表.txt','r').read()).split('sep')))+['很不讨','临走前','装腔作势','宽宏大量','体格检查','考试成绩','普通高校','为情所困','乐不可支']
#获取目录下所有文件的名称
def get_filelist(filetype):
return os.listdir(filepath+filetype)
#过滤掉停用词,返回结巴分词后的列表
def pretreat(content):
words=[]
contents=jieba.lcut(content)
for i in contents:
if i not in stopword:
words.append(i)
return words
#提取中文字符,返回整个文件夹的词列表
def get_wordlist(filetype,filelist):
wordlist=[]
for email in filelist:
onemailword=''
for s in open(filepath+filetype+email):
s=re.sub(re.compile(r"[^\u4e00-\u9fa5]"),'',s)#过滤掉非中文字符
onemailword+=s #把整个文档中文连接起来
onemailword=pretreat(onemailword)
wordlist.append(onemailword)
return wordlist
# #把所有词放到一个列表中,用set去重
def get_wordset(train_x):
wordsets=[]
for i in train_x:
for j in i:
wordsets.append(j)
wordsets=list(set(wordsets))
return wordsets
#生成词袋向量或者词集
def create_wordVec(sample, wordSet, mode="wordSet"):
length = len(wordSet)
wordVec = [0] * len(wordSet)
if mode == "wordSet":
for i in range(length):
if wordSet[i] in sample:
wordVec[i] = 1
elif mode == "wordBag":
for i in range(length):
for j in range(len(sample)):
if sample[j] == wordSet[i]:
wordVec[i] += 1
else:
raise (Exception("The mode must be wordSet or wordBag."))
return wordVec
#转化为矩阵
def get_matrix(trainx,wordsets,mode="wordSet"):
train_matrix = []
for i in range(len(trainx)):
train_matrix.append(create_wordVec(trainx[i], wordsets, "wordSet"))
return train_matrix
#获取测试机标签
def get_testlabel(testfile):
testlabel={}
for j in testfile:
if int(j)>1000:
testlabel[j]=1
else:
testlabel[j]=0
testlabel=list(testlabel.values())
return testlabel
def main():
#读取文件名
norfile = get_filelist('normal')[0:100] # 自己设置读取多少数据作为训练库
spamfile = get_filelist('spam')[0:100]
testfile = get_filelist('test')[0:151]
# 返回各个文件夹下文档的分词列表
norlist = get_wordlist('normal\\', norfile)
spamlist = get_wordlist('spam\\', spamfile)
testlist = get_wordlist('test\\', testfile)
# 生成训练库和标签
train_x = norlist + spamlist
label = [0] * len(norfile) + [1] * len(spamfile)
wordSet = get_wordset(train_x)
testwordSet = get_wordset(testlist)
train_matrix = get_matrix(train_x, wordSet, "wordSet")
test_matrix = get_matrix(testlist, testwordSet, "wordSet")
#转化为矩阵
xtrain=np.array(train_matrix)
ytrain=np.array(label)
testtrain=np.array(test_matrix)
testlabel=np.array(get_testlabel(testfile))
#计算正确率
clf=BernoulliNB()
clf.fit(xtrain,ytrain)
print('测试样本的正确率为{0:.2f}%'.format(clf.score(testtrain,testlabel)*100))
if __name__ == '__main__':
main()