
Predicting credit risk with Python + sklearn decision trees

import numpy as np


import pandas as pd


names = ("Balance,Duration,History,Purpose,Credit amount,Savings,Employment,instPercent,sexMarried,Guarantors,Residence duration,Assets,Age,concCredit,Apartment,Credits,Occupation,Dependents,hasPhone,Foreign,label").split(',')


data=pd.read_csv("Desktop/sunshengyun/data/german/german.data",sep='\s+',names=names)


data.head()


|   | Balance | Duration | History | Purpose | Credit amount | Savings | Employment | instPercent | sexMarried | Guarantors | Residence duration | Assets | Age | concCredit | Apartment | Credits | Occupation | Dependents | hasPhone | Foreign | label |
|---|---------|----------|---------|---------|---------------|---------|------------|-------------|------------|------------|--------------------|--------|-----|------------|-----------|---------|------------|------------|----------|---------|-------|
| 0 | A11 | 6  | A34 | A43 | 1169 | A65 | A75 | 4 | A93 | A101 | 4 | A121 | 67 | A143 | A152 | 2 | A173 | 1 | A192 | A201 | 1 |
| 1 | A12 | 48 | A32 | A43 | 5951 | A61 | A73 | 2 | A92 | A101 | 2 | A121 | 22 | A143 | A152 | 1 | A173 | 1 | A191 | A201 | 2 |
| 2 | A14 | 12 | A34 | A46 | 2096 | A61 | A74 | 2 | A93 | A101 | 3 | A121 | 49 | A143 | A152 | 1 | A172 | 2 | A191 | A201 | 1 |
| 3 | A11 | 42 | A32 | A42 | 7882 | A61 | A74 | 2 | A93 | A103 | 4 | A122 | 45 | A143 | A153 | 1 | A173 | 2 | A191 | A201 | 1 |
| 4 | A11 | 24 | A33 | A40 | 4870 | A61 | A73 | 3 | A93 | A101 | 4 | A124 | 53 | A143 | A153 | 2 | A173 | 2 | A191 | A201 | 2 |

5 rows × 21 columns

data.Balance.unique()


array(['A11', 'A12', 'A14', 'A13'], dtype=object)

data.count()


Balance 1000
Duration 1000
History 1000
Purpose 1000
Credit amount 1000
Savings 1000
Employment 1000
instPercent 1000
sexMarried 1000
Guarantors 1000
Residence duration 1000
Assets 1000
Age 1000
concCredit 1000
Apartment 1000
Credits 1000
Occupation 1000
Dependents 1000
hasPhone 1000
Foreign 1000
label 1000
dtype: int64

# Descriptive statistics for the numeric variables
data.describe()


|       | Duration    | Credit amount | instPercent | Residence duration | Age         | Credits     | Dependents  | label       |
|-------|-------------|---------------|-------------|--------------------|-------------|-------------|-------------|-------------|
| count | 1000.000000 | 1000.000000   | 1000.000000 | 1000.000000        | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
| mean  | 20.903000   | 3271.258000   | 2.973000    | 2.845000           | 35.546000   | 1.407000    | 1.155000    | 1.300000    |
| std   | 12.058814   | 2822.736876   | 1.118715    | 1.103718           | 11.375469   | 0.577654    | 0.362086    | 0.458487    |
| min   | 4.000000    | 250.000000    | 1.000000    | 1.000000           | 19.000000   | 1.000000    | 1.000000    | 1.000000    |
| 25%   | 12.000000   | 1365.500000   | 2.000000    | 2.000000           | 27.000000   | 1.000000    | 1.000000    | 1.000000    |
| 50%   | 18.000000   | 2319.500000   | 3.000000    | 3.000000           | 33.000000   | 1.000000    | 1.000000    | 1.000000    |
| 75%   | 24.000000   | 3972.250000   | 4.000000    | 4.000000           | 42.000000   | 2.000000    | 1.000000    | 2.000000    |
| max   | 72.000000   | 18424.000000  | 4.000000    | 4.000000           | 75.000000   | 4.000000    | 2.000000    | 2.000000    |
data.Duration.unique()


array([ 6, 48, 12, 42, 24, 36, 30, 15, 9, 10, 7, 60, 18, 45, 11, 27, 8,
54, 20, 14, 33, 21, 16, 4, 47, 13, 22, 39, 28, 5, 26, 72, 40], dtype=int64)

data.History.unique()


array(['A34', 'A32', 'A33', 'A30', 'A31'], dtype=object)

data.groupby('Balance').size().sort_values(ascending=False)

Balance
A14 394
A11 274
A12 269
A13 63
dtype: int64

data.groupby('Purpose').size().sort_values(ascending=False)

Purpose
A43 280
A40 234
A42 181
A41 103
A49 97
A46 50
A45 22
A44 12
A410 12
A48 9
dtype: int64

data.groupby('Apartment').size().sort_values(ascending=False)

Apartment
A152 713
A151 179
A153 108
dtype: int64

import matplotlib.pyplot as plt
%matplotlib inline
data.plot(x='label', y='Age', kind='scatter',
          alpha=0.02, s=50);


![png](output_13_0.png)

data.hist('Age', bins=15);


![png](output_14_0.png)
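A per-class histogram makes the age/label relationship easier to read than the scatter plot above. A minimal sketch (not in the original notebook), using pandas' by parameter:

data.hist('Age', by='label', bins=15, sharex=True, sharey=True);  # one panel per label value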

target = data.label


features_data = data.drop('label', axis=1)


numeric_features = [c for c in features_data if features_data[c].dtype.kind in ('i', 'f')]  # keep the variables whose dtype is integer or float


numeric_features


['Duration',
 'Credit amount',
 'instPercent',
 'Residence duration',
 'Age',
 'Credits',
 'Dependents']

numeric_data = features_data[numeric_features]


numeric_data.head()


|   | Duration | Credit amount | instPercent | Residence duration | Age | Credits | Dependents |
|---|----------|---------------|-------------|--------------------|-----|---------|------------|
| 0 | 6  | 1169 | 4 | 4 | 67 | 2 | 1 |
| 1 | 48 | 5951 | 2 | 2 | 22 | 1 | 1 |
| 2 | 12 | 2096 | 2 | 3 | 49 | 1 | 2 |
| 3 | 42 | 7882 | 2 | 4 | 45 | 1 | 2 |
| 4 | 24 | 4870 | 3 | 4 | 53 | 2 | 2 |
categorical_data = features_data.drop(numeric_features, axis=1)


categorical_data.head()


|   | Balance | History | Purpose | Savings | Employment | sexMarried | Guarantors | Assets | concCredit | Apartment | Occupation | hasPhone | Foreign |
|---|---------|---------|---------|---------|------------|------------|------------|--------|------------|-----------|------------|----------|---------|
| 0 | A11 | A34 | A43 | A65 | A75 | A93 | A101 | A121 | A143 | A152 | A173 | A192 | A201 |
| 1 | A12 | A32 | A43 | A61 | A73 | A92 | A101 | A121 | A143 | A152 | A173 | A191 | A201 |
| 2 | A14 | A34 | A46 | A61 | A74 | A93 | A101 | A121 | A143 | A152 | A172 | A191 | A201 |
| 3 | A11 | A32 | A42 | A61 | A74 | A93 | A103 | A122 | A143 | A153 | A173 | A191 | A201 |
| 4 | A11 | A33 | A40 | A61 | A73 | A93 | A101 | A124 | A143 | A153 | A173 | A191 | A201 |
categorical_data_encoded = categorical_data.apply(lambda x: pd.factorize(x)[0])  # pd.factorize encodes each categorical variable as integer codes
# apply runs the encoding on every column
categorical_data_encoded.head(5)


|   | Balance | History | Purpose | Savings | Employment | sexMarried | Guarantors | Assets | concCredit | Apartment | Occupation | hasPhone | Foreign |
|---|---------|---------|---------|---------|------------|------------|------------|--------|------------|-----------|------------|----------|---------|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 2 | 0 | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
| 3 | 0 | 1 | 2 | 1 | 2 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 |
| 4 | 0 | 2 | 3 | 1 | 1 | 0 | 0 | 2 | 0 | 1 | 0 | 1 | 0 |
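pd.factorize also returns the array of original categories alongside the integer codes, so the encoding can be traced back later. A small sketch (the mappings dict is a hypothetical helper, not part of the original analysis):

mappings = {col: dict(enumerate(pd.factorize(categorical_data[col])[1]))
            for col in categorical_data.columns}
mappings['Balance']  # {0: 'A11', 1: 'A12', 2: 'A14', 3: 'A13'}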
features = pd.concat([numeric_data, categorical_data_encoded], axis=1)  # merge the numeric and encoded categorical variables
features.head()
# One-hot encoding is an alternative representation for the categorical variables:
# features = pd.get_dummies(features_data)
# features.head()


|   | Duration | Credit amount | instPercent | Residence duration | Age | Credits | Dependents | Balance | History | Purpose | Savings | Employment | sexMarried | Guarantors | Assets | concCredit | Apartment | Occupation | hasPhone | Foreign |
|---|----------|---------------|-------------|--------------------|-----|---------|------------|---------|---------|---------|---------|------------|------------|------------|--------|------------|-----------|------------|----------|---------|
| 0 | 6  | 1169 | 4 | 4 | 67 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 48 | 5951 | 2 | 2 | 22 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 12 | 2096 | 2 | 3 | 49 | 1 | 2 | 2 | 0 | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
| 3 | 42 | 7882 | 2 | 4 | 45 | 1 | 2 | 0 | 1 | 2 | 1 | 2 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 |
| 4 | 24 | 4870 | 3 | 4 | 53 | 2 | 2 | 0 | 2 | 3 | 1 | 1 | 0 | 0 | 2 | 0 | 1 | 0 | 1 | 0 |
X = features.values.astype(np.float32)  # convert the feature matrix to float32
y = (target.values == 1).astype(np.int32)  # label 1 = good credit, 2 = bad credit
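The label mean of 1.3 in the describe() output above implies about 300 of the 1000 applicants carry the bad label, so the classes are imbalanced. A quick sanity check:

pd.Series(y).value_counts()  # roughly 700 good (1) vs. 300 bad (0)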


from sklearn.model_selection import train_test_split  # train_test_split performs the split (the old sklearn.cross_validation module has been removed)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)  # test_size sets the fraction held out for testing
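Given the roughly 70/30 class split, a stratified split keeps the good/bad ratio identical in the training and test sets. A minimal variant (not what the results below were produced with):

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)  # preserve the class ratio in both sets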


from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

clf = DecisionTreeClassifier(max_depth=8)  # max_depth caps the depth of the tree

# Cross-validate the classifier, scoring by the area under the ROC curve (AUC);
# a larger AUC means a better classifier
scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
print("ROC AUC Decision Tree: {:.4f} +/-{:.4f}".format(
    np.mean(scores), np.std(scores)))


ROC AUC Decision Tree: 0.6866 +/-0.0105


# Learning curves: score vs. number of training samples on the training and
# cross-validation sets, used to compare trees of different depths and to
# diagnose over- or underfitting
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, X, y, ylim=(0, 1.1), cv=3,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5),
                        scoring=None):
    plt.title("Learning curves for %s" % type(estimator).__name__)
    plt.ylim(*ylim); plt.grid()
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, validation_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        scoring=scoring)
    train_scores_mean = np.mean(train_scores, axis=1)
    validation_scores_mean = np.mean(validation_scores, axis=1)

    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, validation_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    # note: this reports the validation score at the largest training size,
    # i.e. the right-hand end of the curve
    print("Best validation score: {:.4f}".format(validation_scores_mean[-1]))


clf = DecisionTreeClassifier(max_depth=None)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')
# The large gap between the training and cross-validation scores suggests
# the tree is overfitting the training data


Best validation score: 0.6310




clf = DecisionTreeClassifier(max_depth=10)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')


Best validation score: 0.6565




clf = DecisionTreeClassifier(max_depth=8)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')


Best validation score: 0.6762




clf = DecisionTreeClassifier(max_depth=5)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')


Best validation score: 0.7219




clf = DecisionTreeClassifier(max_depth=4)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')


Best validation score: 0.7226




# Validation curves: training and cross-validation scores as a function of tree depth
from sklearn.model_selection import validation_curve

def plot_validation_curve(estimator, X, y, param_name, param_range,
                          ylim=(0, 1.1), cv=3, n_jobs=1, scoring=None):
    estimator_name = type(estimator).__name__
    plt.title("Validation curves for %s on %s"
              % (param_name, estimator_name))
    plt.ylim(*ylim); plt.grid()
    plt.xlim(min(param_range), max(param_range))
    plt.xlabel(param_name)
    plt.ylabel("Score")

    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        cv=cv, n_jobs=n_jobs, scoring=scoring)

    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    plt.semilogx(param_range, train_scores_mean, 'o-', color="r",
                 label="Training score")
    plt.semilogx(param_range, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    plt.legend(loc="best")
    # note: this reports the score at the largest parameter value tried
    print("Best test score: {:.4f}".format(test_scores_mean[-1]))


clf = DecisionTreeClassifier(max_depth=8)
param_name = 'max_depth'
param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

plot_validation_curve(clf, X_train, y_train,
                      param_name, param_range, scoring='roc_auc')


Best test score: 0.6409
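The printed number is the score at max_depth=15, the right-hand end of the curve; the curve itself peaks at a shallower depth. A sketch that reads the best depth off a recomputed curve (assuming the same CV setup as above):

train_scores, test_scores = validation_curve(
    DecisionTreeClassifier(), X_train, y_train,
    param_name=param_name, param_range=param_range,
    cv=3, scoring='roc_auc')
param_range[np.argmax(test_scores.mean(axis=1))]  # depth with the highest mean CV score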




# Try a random forest to improve on the single tree
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=27, max_features=15,  # n_estimators sets the number of trees in the forest
                             max_depth=10)

scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc',
                         n_jobs=1)
print("ROC Random Forest: {:.4f} +/-{:.4f}".format(
    np.mean(scores), np.std(scores)))


ROC Random Forest: 0.7817 +/-0.0208


clf.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=10, max_features=15, max_leaf_nodes=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=27, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)


from sklearn.metrics import roc_auc_score
y_pred_proba = clf.predict_proba(X_test)[:, 1]
print("ROC AUC: %0.4f" % roc_auc_score(y_test, y_pred_proba))


ROC AUC: 0.7394
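A single AUC value hides the shape of the trade-off. A minimal sketch that plots the ROC curve behind the score above, using roc_curve from sklearn.metrics:

from sklearn.metrics import roc_curve

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label='Random forest (AUC = 0.7394)')
plt.plot([0, 1], [0, 1], 'k--', label='Chance')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='best');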


from sklearn.model_selection import GridSearchCV  # replaces the removed sklearn.grid_search module
# Grid search over the forest's hyperparameters (pass n_jobs=-1 to search in parallel)
parameters = {'n_estimators': [5, 11, 15, 21, 25, 31],
              'max_features': [5, 10, 15, 20],
              'max_depth': [3, 6, 9, 12],
              'criterion': ['gini', 'entropy']}
clf = GridSearchCV(RandomForestClassifier(), parameters, cv=3)
clf.fit(X_train, y_train)


GridSearchCV(cv=3, error_score='raise',
estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False),
fit_params={}, iid=True, n_jobs=1,
param_grid={'n_estimators': [5, 11, 15, 21, 25, 31], 'max_features': [5, 10, 15, 20], 'criterion': ['gini', 'entropy'], 'max_depth': [3, 6, 9, 12]},
pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)


y_pred_proba = clf.predict_proba(X_test)[:, 1]
print("ROC AUC: %0.4f" % roc_auc_score(y_test, y_pred_proba))


ROC AUC: 0.7551


clf.best_params_


{'criterion': 'entropy',
'max_depth': 6,
'max_features': 15,
'n_estimators': 21}


clf.best_score_


0.78374999999999995
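Because refit=True, the grid search has already refit a forest with the best parameters on the full training set. A short sketch (not in the original post) that inspects which variables that forest relies on:

best_rf = clf.best_estimator_
importances = pd.Series(best_rf.feature_importances_, index=features.columns)
importances.sort_values(ascending=False).head(10)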