
Algorithm Practice (3) [Task 2: Model Evaluation]

2018-12-27 17:46

[Task 2: Model Evaluation]
Record a score table of accuracy, precision, recall, F1-score, and AUC for the 7 models (built on top of Task 1), and plot their ROC curves.
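As a quick reference for these metrics, here is a minimal sketch (not part of the original task code) of how accuracy, precision, recall, and F1 relate to the entries of the confusion matrix; the helper name metrics_from_confusion is made up for illustration:

# Illustrative only: the threshold metrics expressed via the confusion matrix
# (TP/FP/FN/TN are counted with status == 1 as the positive class).
from sklearn.metrics import confusion_matrix

def metrics_from_confusion(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    accuracy = (tp + tn) / (tp + tn + fp + fn)                # share of all samples classified correctly
    precision = tp / (tp + fp)                                # share of predicted positives that are real
    recall = tp / (tp + fn)                                   # share of real positives that are found
    f1 = 2 * precision * recall / (precision + recall)        # harmonic mean of precision and recall
    return accuracy, precision, recall, f1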

# Load libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import matplotlib.pyplot as plt

# Read the data
data_all = pd.read_csv('data_all.csv', encoding='gbk')

# Split the dataset (DataFrame.as_matrix() has been removed from pandas; use .values instead)
x = data_all.drop(columns=["status"]).values
y = data_all["status"].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2018)

# Standardization (fit the scaler on the training set only, then apply it to both sets)
scaler = StandardScaler()
scaler.fit(x_train)
x_train_standard = scaler.transform(x_train)
x_test_standard = scaler.transform(x_test)

# Define the scoring function: prints accuracy / precision / recall / F1 / AUC for the
# train and test sets and plots both ROC curves on one figure.
def get_scores(y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba):
    # Accuracy
    train_accuracy = metrics.accuracy_score(y_train, y_train_predict)
    test_accuracy = metrics.accuracy_score(y_test, y_test_predict)
    # Precision
    train_precision = metrics.precision_score(y_train, y_train_predict)
    test_precision = metrics.precision_score(y_test, y_test_predict)
    # Recall
    train_recall = metrics.recall_score(y_train, y_train_predict)
    test_recall = metrics.recall_score(y_test, y_test_predict)
    # F1-score
    train_f1_score = metrics.f1_score(y_train, y_train_predict)
    test_f1_score = metrics.f1_score(y_test, y_test_predict)
    # AUC (computed from scores/probabilities, not from hard predictions)
    train_auc = metrics.roc_auc_score(y_train, y_train_proba)
    test_auc = metrics.roc_auc_score(y_test, y_test_proba)
    # ROC curves
    train_fprs, train_tprs, train_thresholds = metrics.roc_curve(y_train, y_train_proba)
    test_fprs, test_tprs, test_thresholds = metrics.roc_curve(y_test, y_test_proba)
    plt.plot(train_fprs, train_tprs, label="train")
    plt.plot(test_fprs, test_tprs, label="test")
    plt.title("ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.show()
    # Print the scores
    print("Train accuracy:", train_accuracy)
    print("Test accuracy:", test_accuracy)
    print("Train precision:", train_precision)
    print("Test precision:", test_precision)
    print("Train recall:", train_recall)
    print("Test recall:", test_recall)
    print("Train F1-score:", train_f1_score)
    print("Test F1-score:", test_f1_score)
    print("Train AUC:", train_auc)
    print("Test AUC:", test_auc)

# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=2018)
lr.fit(x_train_standard, y_train)
y_train_predict = lr.predict(x_train_standard)
y_test_predict = lr.predict(x_test_standard)
y_train_proba = lr.predict_proba(x_train_standard)[:, 1]
y_test_proba = lr.predict_proba(x_test_standard)[:, 1]
get_scores(y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)

# SVM
from sklearn.svm import LinearSVC
svm_linearSVC = LinearSVC(random_state=2018)
svm_linearSVC.fit(x_train_standard, y_train)
y_train_predict = svm_linearSVC.predict(x_train_standard)
y_test_predict = svm_linearSVC.predict(x_test_standard)
# LinearSVC has no predict_proba, so use its decision_function scores for AUC/ROC
y_train_proba = svm_linearSVC.decision_function(x_train_standard)
y_test_proba = svm_linearSVC.decision_function(x_test_standard)
get_scores(y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)

# Decision Tree
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=2018)
tree.fit(x_train_standard, y_train)
y_train_predict = tree.predict(x_train_standard)
y_test_predict = tree.predict(x_test_standard)
y_train_proba = tree.predict_proba(x_train_standard)[:, 1]
y_test_proba = tree.predict_proba(x_test_standard)[:, 1]
get_scores(y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)

# Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=2018)
rf.fit(x_train_standard, y_train)
y_train_predict = rf.predict(x_train_standard)
y_test_predict = rf.predict(x_test_standard)
y_train_proba = rf.predict_proba(x_train_standard)[:, 1]
y_test_proba = rf.predict_proba(x_test_standard)[:, 1]
get_scores(y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)

# GBDT
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(random_state=2018)
gb.fit(x_train_standard, y_train)
y_train_predict = gb.predict(x_train_standard)
y_test_predict = gb.predict(x_test_standard)
y_train_proba = gb.predict_proba(x_train_standard)[:, 1]
y_test_proba = gb.predict_proba(x_test_standard)[:, 1]
get_scores(y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)

# XGBoost
from xgboost import XGBClassifier
xgb = XGBClassifier(random_state=2018)
xgb.fit(x_train_standard, y_train)
y_train_predict = xgb.predict(x_train_standard)
y_test_predict = xgb.predict(x_test_standard)
y_train_proba = xgb.predict_proba(x_train_standard)[:, 1]
y_test_proba = xgb.predict_proba(x_test_standard)[:, 1]
get_scores(y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)

# LightGBM
from lightgbm import LGBMClassifier
lg = LGBMClassifier(random_state=2018)
lg.fit(x_train_standard, y_train)
y_train_predict = lg.predict(x_train_standard)
y_test_predict = lg.predict(x_test_standard)
y_train_proba = lg.predict_proba(x_train_standard)[:, 1]
y_test_proba = lg.predict_proba(x_test_standard)[:, 1]
get_scores(y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)
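The task asks for the scores to be recorded as a table, whereas get_scores above only prints them. Here is a minimal sketch of how the same test-set metrics could instead be collected into a pandas DataFrame; the collect_scores helper and the row names are made up for illustration, and only the fitted estimators (lr, svm_linearSVC, tree, rf, gb, xgb, lg) come from the code above:

# Hypothetical helper: compute the test-set metrics for one fitted model and
# return them as one row of the score table.
def collect_scores(name, model):
    y_te_pred = model.predict(x_test_standard)
    # LinearSVC has no predict_proba, so fall back to decision_function scores for AUC
    if hasattr(model, "predict_proba"):
        y_te_score = model.predict_proba(x_test_standard)[:, 1]
    else:
        y_te_score = model.decision_function(x_test_standard)
    return {
        "model": name,
        "accuracy": metrics.accuracy_score(y_test, y_te_pred),
        "precision": metrics.precision_score(y_test, y_te_pred),
        "recall": metrics.recall_score(y_test, y_te_pred),
        "f1": metrics.f1_score(y_test, y_te_pred),
        "auc": metrics.roc_auc_score(y_test, y_te_score),
    }

score_table = pd.DataFrame([
    collect_scores("LogisticRegression", lr),
    collect_scores("LinearSVC", svm_linearSVC),
    collect_scores("DecisionTree", tree),
    collect_scores("RandomForest", rf),
    collect_scores("GBDT", gb),
    collect_scores("XGBoost", xgb),
    collect_scores("LightGBM", lg),
]).set_index("model")
print(score_table)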

Results:

1. Logistic Regression
Train accuracy: 0.8049293657950105
Test accuracy: 0.7876664330763841
Train precision: 0.7069351230425056
Test precision: 0.6609195402298851
Train recall: 0.37889688249400477
Test recall: 0.3203342618384401
Train F1-score: 0.4933645589383294
Test F1-score: 0.4315196998123827
Train AUC: 0.8198240444948492
Test AUC: 0.7657402480882185

2. SVM (LinearSVC)
Train accuracy: 0.8025247971145176
Test accuracy: 0.7813594954449895
Train precision: 0.7251908396946565
Test precision: 0.6535947712418301
Train recall: 0.34172661870503596
Test recall: 0.2785515320334262
Train F1-score: 0.4645476772616136
Test F1-score: 0.390625
Train AUC: 0.8210514620794339
Test AUC: 0.7677954784931093

3. Decision Tree
Train accuracy: 1.0
Test accuracy: 0.6853538892782061
Train precision: 1.0
Test precision: 0.38402061855670105
Train recall: 1.0
Test recall: 0.415041782729805
Train F1-score: 1.0
Test F1-score: 0.3989290495314592
Train AUC: 1.0
Test AUC: 0.5956295055971123

4. Random Forest
Train accuracy: 1.0
Test accuracy: 0.779957953749124
Train precision: 1.0
Test precision: 0.6691729323308271
Train recall: 1.0
Test recall: 0.2479108635097493
Train F1-score: 1.0
Test F1-score: 0.36178861788617883
Train AUC: 1.0
Test AUC: 0.7489736888777607

5. GBDT
Train accuracy: 0.8623384430417794
Test accuracy: 0.7806587245970568
Train precision: 0.8836734693877552
Test precision: 0.6116504854368932
Train recall: 0.5191846522781774
Test recall: 0.35097493036211697
Train F1-score: 0.6540785498489425
Test F1-score: 0.44601769911504424
Train AUC: 0.9207464353427005
Test AUC: 0.7633146589047812

6. XGBoost
Train accuracy: 0.8539224526600541
Test accuracy: 0.7841625788367204
Train precision: 0.875
Test precision: 0.624390243902439
Train recall: 0.486810551558753
Test recall: 0.3565459610027855
Train F1-score: 0.6255778120184899
Test F1-score: 0.45390070921985815
Train AUC: 0.9174710772897927
Test AUC: 0.7708522424963224

7. LightGBM
Train accuracy: 0.9957920048091373
Test accuracy: 0.7701471618780659
Train precision: 1.0
Test precision: 0.5688888888888889
Train recall: 0.9832134292565947
Test recall: 0.3565459610027855
Train F1-score: 0.9915356711003627
Test F1-score: 0.4383561643835616
Train AUC: 0.9999826853318788
Test AUC: 0.7535210165566023

Note that the Decision Tree, Random Forest, and LightGBM models fit the training set (almost) perfectly while scoring much lower on the test set, which points to overfitting under the default parameters.

ROC curves (one figure per model, as produced by get_scores):

  1. Logistic Regression
  2. SVM
  3. Decision Tree
  4. Random Forest
  5. GBDT
  6. XGBoost
  7. LightGBM
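The per-model ROC figures above are produced one at a time by get_scores. As a follow-up, here is a minimal sketch (not in the original post) that overlays all seven test-set ROC curves on a single figure for easier comparison, assuming the fitted estimators lr, svm_linearSVC, tree, rf, gb, xgb, and lg from the code above are still in scope:

# Overlay the test-set ROC curves of all seven models on one figure.
models = {
    "LogisticRegression": lr,
    "LinearSVC": svm_linearSVC,
    "DecisionTree": tree,
    "RandomForest": rf,
    "GBDT": gb,
    "XGBoost": xgb,
    "LightGBM": lg,
}
plt.figure(figsize=(8, 6))
for name, model in models.items():
    # LinearSVC has no predict_proba, so fall back to decision_function scores
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(x_test_standard)[:, 1]
    else:
        scores = model.decision_function(x_test_standard)
    fprs, tprs, _ = metrics.roc_curve(y_test, scores)
    auc = metrics.roc_auc_score(y_test, scores)
    plt.plot(fprs, tprs, label="%s (AUC=%.3f)" % (name, auc))
plt.plot([0, 1], [0, 1], linestyle="--", color="grey")  # chance line
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Test-set ROC curves")
plt.legend()
plt.show()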