利用朴素贝叶斯进行垃圾邮件分类

朴素贝叶斯在文本分类、垃圾邮件识别、情感分析等方面有很好的应用。下面就来体验一下。数据借用了 https://github.com/shijing888/BayesSpam ,该仓库的文件夹里有已经分类好的正常邮件、垃圾邮件和测试邮件,也附带了代码,不过本文的代码是我自己写的。

主要过程:读取邮件-分词-转化为词袋模型、调用朴素贝叶斯进行分类。代码比较乱,直接上代码:

import jieba,re  
import os  
import numpy as np  
from sklearn.naive_bayes import BernoulliNB  
# Root folder of the data set; the sub-folder and file names are appended to it
# by plain string concatenation elsewhere in this file.
filepath=r'C:\Users\lbship\Desktop\脚本\BayesSpam-master\data\\'
# Load the stop words: one entry per non-empty line (assumes the file lists one
# stop word per line -- TODO confirm), de-duplicated through a set and turned
# back into a list, then extended with a few hand-picked extra entries.
# Iterating the file line by line keeps multi-character stop words intact, and
# the `with` block closes the file handle once loading is done.
# The file is decoded with the platform default encoding.
with open(r'C:\Users\lbship\Desktop\脚本\BayesSpam-master\data\中文停用词表.txt','r') as _stopword_file:
    stopword=list({_line.strip() for _line in _stopword_file if _line.strip()})+['很不讨','临走前','装腔作势','宽宏大量','体格检查','考试成绩','普通高校','为情所困','乐不可支']
# List the entry names found in one data sub-folder.
def get_filelist(filetype):
    """Return the names of all entries inside the ``filetype`` sub-folder.

    The folder path is formed by appending ``filetype`` to the module-level
    ``filepath`` string; the names come back in whatever order
    ``os.listdir`` yields them.
    """
    target_dir = filepath + filetype
    entries = os.listdir(target_dir)
    return entries
# Segment text with jieba and drop the stop words.
def pretreat(content):
    """Return the jieba tokens of ``content`` that are not stop words.

    Args:
        content: the text to segment.

    Returns:
        A list of tokens in their original order, with every token that
        appears in the module-level ``stopword`` list removed.
    """
    # Build the lookup once per call so each token costs an O(1) membership
    # test instead of a scan over the whole stop-word list.
    stop_lookup = set(stopword)
    return [token for token in jieba.lcut(content) if token not in stop_lookup]
# Keep only the Chinese characters of each e-mail and tokenise them.
def get_wordlist(filetype,filelist):
    """Return one token list per e-mail file in a data sub-folder.

    Args:
        filetype: sub-folder name including its trailing path separator; it is
            concatenated between ``filepath`` and each file name.
        filelist: names of the files to read from that sub-folder.

    Returns:
        A list with one entry per file, in ``filelist`` order; each entry is
        the result of ``pretreat`` applied to the file's Chinese characters.
    """
    # Compiled once: matches every character outside the \u4e00-\u9fa5 range.
    non_chinese = re.compile(r"[^\u4e00-\u9fa5]")
    wordlist = []
    for email in filelist:
        # `with` guarantees the handle is closed even if reading fails.
        with open(filepath+filetype+email) as mail_file:
            # Strip non-Chinese characters line by line, then glue the lines
            # together so the whole e-mail is segmented as a single string.
            chinese_only = ''.join(non_chinese.sub('', line) for line in mail_file)
        wordlist.append(pretreat(chinese_only))
    return wordlist
# Collect every distinct word that occurs in any of the documents.
def get_wordset(train_x):
    """Return the distinct words of ``train_x`` as a list.

    Args:
        train_x: an iterable of documents, each an iterable of words.

    Returns:
        A list holding each distinct word once. The order is whatever the
        underlying set yields, so it is not sorted.
    """
    vocabulary = set()
    for document in train_x:
        vocabulary.update(document)
    return list(vocabulary)
# Build a set-of-words (0/1) or bag-of-words (count) vector for one sample.
def create_wordVec(sample, wordSet, mode="wordSet"):
    """Encode ``sample`` as a vector aligned with the vocabulary ``wordSet``.

    Args:
        sample: the tokens of one document (tokens must be hashable).
        wordSet: the vocabulary; position ``i`` of the result describes
            ``wordSet[i]``.
        mode: ``"wordSet"`` marks presence with 1/0, ``"wordBag"`` stores how
            many times each vocabulary word occurs in ``sample``.

    Returns:
        A list of ints with the same length as ``wordSet``.

    Raises:
        ValueError: if ``mode`` is neither ``"wordSet"`` nor ``"wordBag"``.
            ``ValueError`` is an ``Exception`` subclass, so callers that catch
            ``Exception`` keep working.
    """
    if mode == "wordSet":
        # One pass to index the sample, then O(1) lookups per vocabulary word.
        present = set(sample)
        return [1 if word in present else 0 for word in wordSet]
    if mode == "wordBag":
        from collections import Counter  # local import: only this branch needs it
        occurrences = Counter(sample)
        # Counter returns 0 for words that never occur in the sample.
        return [occurrences[word] for word in wordSet]
    raise ValueError("The mode must be wordSet or wordBag.")
# Turn a list of tokenised documents into a list of feature vectors.
def get_matrix(trainx,wordsets,mode="wordSet"):
    """Vectorise every document in ``trainx`` against ``wordsets``.

    Args:
        trainx: a sequence of documents, each a list of tokens.
        wordsets: the vocabulary that defines the vector columns.
        mode: forwarded to ``create_wordVec``; ``"wordSet"`` for 0/1 presence
            flags or ``"wordBag"`` for occurrence counts.

    Returns:
        A list with one vector (list of ints) per document, in input order.
    """
    # Forward the caller's mode so that "wordBag" actually takes effect.
    return [create_wordVec(document, wordsets, mode) for document in trainx]
# Derive the test-set labels from the test file names.
def get_testlabel(testfile):
    """Return one 0/1 label per test file name.

    Args:
        testfile: an iterable of file names that ``int()`` can parse.

    Returns:
        A list with exactly one label per input name, in input order: 1 when
        the name's integer value is greater than 1000, otherwise 0.

    Raises:
        ValueError: if a name cannot be converted with ``int()``.
    """
    # Built directly as a list so repeated names still yield one label each and
    # the result always lines up with the input.
    return [1 if int(name) > 1000 else 0 for name in testfile]
def main():
    """Train a Bernoulli naive-Bayes spam classifier and print its test accuracy.

    Reads up to 100 normal and 100 spam e-mails as training data and up to 151
    test e-mails, vectorises all of them against the vocabulary of the training
    set, fits ``BernoulliNB`` and prints the accuracy on the test e-mails.
    """
    # File names; the slices control how many files are used from each folder.
    norfile = get_filelist('normal')[0:100]
    spamfile = get_filelist('spam')[0:100]
    testfile = get_filelist('test')[0:151]
    # Token lists for every document in each folder.
    norlist = get_wordlist('normal\\', norfile)
    spamlist = get_wordlist('spam\\', spamfile)
    testlist = get_wordlist('test\\', testfile)
    # Training samples and labels: 0 for normal mail, 1 for spam.
    train_x = norlist + spamlist
    label = [0] * len(norfile) + [1] * len(spamfile)
    # The vocabulary comes from the training data only.
    wordSet = get_wordset(train_x)
    train_matrix = get_matrix(train_x, wordSet, "wordSet")
    # Test e-mails must be encoded with the SAME vocabulary as the training
    # data: the classifier's feature columns are defined by `wordSet`, so a
    # separate test vocabulary would give vectors of a different width and
    # meaning.
    test_matrix = get_matrix(testlist, wordSet, "wordSet")
    # Convert to numpy arrays for scikit-learn.
    xtrain=np.array(train_matrix)
    ytrain=np.array(label)
    testtrain=np.array(test_matrix)
    testlabel=np.array(get_testlabel(testfile))
    # Fit the model and report the accuracy on the test set.
    clf=BernoulliNB()
    clf.fit(xtrain,ytrain)
    print('测试样本的正确率为{0:.2f}%'.format(clf.score(testtrain,testlabel)*100))
# Run the experiment only when this file is executed as a script, not on import.
if __name__ == '__main__':  
    main()  

image