Please enable Javascript to view the contents

机器学习实践 - 垃圾邮件分类器

 ·  ☕ 2 分钟

数据源:SMS Spam Collection

代码

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import time
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb


def runplt():
    '''
    绘制 ROC 曲线的准备工作
    '''
    fig, ax = plt.subplots()
    ax.set_title('ROC curve and AUC')
    ax.set_xlabel('FPR (False Positive Rate)')
    ax.set_ylabel('TPR (True Positive Rate)')
    ax.plot([0, 1], [0, 1], color='navy', ls='--', label='random: 0.5')
    ax.plot([0, 0, 1, 1], [0, 1, 1, 1], color='forestgreen', ls='--', label='perfect: 1')
    return ax

def main():
    '''
    任务:垃圾邮件分类
    '''
    # 加载数据集
    df = pd.read_table('/path/to/SMSSpamCollection.txt', header=None)
    feature, target = df[1], df[0]
    
    # 拆分训练集
    X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size=0.3, random_state=777)
    print('>>> 训练集:{} 组, 测试集:{} 组 <<<'.format(X_train.shape[0], X_test.shape[0]))
    
    # 预处理: 处理文本 (词袋模型)
    cv = CountVectorizer(stop_words='english')
    X_train = cv.fit_transform(X_train)
    X_test = cv.transform(X_test)
    
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)
    
    # 构建多个分类器
    names = [
        'KNN', 
        'Logistic', 
        'SVM',
        'NaiveBayes', 
        'DecisionTree', 
        'RandomForest', 
        'XGBoost',
    ]
    classifiers = [
        KNeighborsClassifier(n_neighbors=3),
        LogisticRegression(),
        SVC(kernel='linear', probability=True),
        MultinomialNB(),
        DecisionTreeClassifier(),
        RandomForestClassifier(n_estimators=100),
        xgb.XGBClassifier(tree_method='hist'),
    ]
    
    # 批处理: 模型训练, 保存评估指标, 绘制 ROC 曲线
    ax = runplt()
    report = []
    
    for name, clf in zip(names, classifiers):
        # 模型训练
        start_time = time.time()
        clf.fit(X_train, y_train)
        duration = time.time() - start_time
        
        # 模型评估: 拟合度
        score_train = clf.score(X_train, y_train)
        score_test = clf.score(X_test, y_test)
        print('{} (耗时 {:.5f} 秒):\n  训练集准确率: {:.3f}\n  测试集准确率: {:.3f}'.format(name, duration, score_train, score_test))
        
        # 模型评估: 准确率/精确率/召回率/F1
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        # 模型评估: 绘制 ROC 曲线并标注 AUC 值
        y_score = clf.predict_proba(X_test)
        if y_score.shape[1] == 2:
            y_score = y_score[:,1]
        fpr, tpr, thresholds = roc_curve(y_test, y_score)
        auc = roc_auc_score(y_test, y_score)
        ax.plot(fpr, tpr, label='{}: ${:.3f}$'.format(name, auc))

        # 将所有评估指标保存在 dataframe
        _ = {
            'classifier': name, 
            'duration': duration, 
            'accuracy_train': score_train, 
            'accuracy_test': score_test, 
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'AUC': auc,
        }
        report.append(_)

    # 打印 dataframe
    df = pd.DataFrame(report)
    df = df.sort_values(by='AUC', ascending=False)
    print(df)
    
    # 保存 ROC 曲线
    ax.legend(loc='best', frameon=True, fontsize='small')
    plt.savefig('ROC.svg')
    plt.show()
 

if __name__ == '__main__':
    main()

运行结果

指标意义详见:分类指标

打印


ROC

分享

Molly Wang
作者
Molly Wang
一个数据产品人的自我修养