您的位置：首页 > 编程语言 > Python开发

Python 机器学习及实践 Coded One :使用经典的分类模型和回归模型对数据进行训练预测评估 PS 欢迎指点指明错误！！！！...

2020-02-12 10:03 716 查看

^(*￣(oo)￣)^：1.有部分代码我进行了数据归一化操作（也叫数据标准化），在评估的时候使用了inverse_transform函数把数据还原

2.code的代码是按书中的顺序先进行了数据抽样（split）然后进行了归一化操作（StandardScaler）

先进行数据抽样会使数据的比例发生改变，再进行归一化操作，这样是不当的

正常情况应该先进行归一化操作然后再进行数据抽样（注：这一点存在争议——通常的做法是先划分数据，只用训练集拟合 StandardScaler，再用同一个 scaler 去 transform 测试集，以避免测试集的信息泄漏到训练过程中）

PartOne经典分类模型（做选择题：比如：判断是A类还是B类）

使用线性分类模型从事良/恶性肿瘤预测任务（LogisticRegression和SGDClassifier）

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import stochastic_gradient
from sklearn.metrics import classification_report
# Breast-cancer classification with LogisticRegression and SGDClassifier.
# The column names are supplied to read_csv through `names=`.
column_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
    names=column_names,
)

# The script treats '?' as the missing-value marker: convert it to NaN, report
# the per-column count of missing entries, then drop every incomplete row.
data = data.replace(to_replace='?', value=np.nan)
print(data.isnull().sum())
data = data.dropna(how='any')

print(data.shape)

# Columns 1-9 are used as features and column 10 ('Class') as the label;
# column 0 ('Sample code number') is left out.
x_train, x_test, y_train, y_test = train_test_split(
    data[column_names[1:10]],
    data[column_names[10]],
    test_size=0.25,
    random_state=33,
)
print(y_train.value_counts())
print(y_test.value_counts())

# Fit the scaler on the training split only and reuse the same statistics for
# the test split; refitting on the test data would scale the two splits
# differently and leak test information into preprocessing.
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

lr = LogisticRegression()
lr.fit(x_train, y_train)
lr_y_predict = lr.predict(x_test)
print('Accuracy of LR ClassifierL:', lr.score(x_test, y_test))
print(classification_report(y_test, lr_y_predict, target_names=['Benign', 'Malignant']))

# NOTE: `sklearn.linear_model.stochastic_gradient` is a private module that
# newer scikit-learn releases no longer provide; there, import SGDClassifier
# directly from sklearn.linear_model instead.
sgdc = stochastic_gradient.SGDClassifier()
sgdc.fit(x_train, y_train)
sgdc_y_predict = sgdc.predict(x_test)

print('Accuracy of SGD ClassifierL:', sgdc.score(x_test, y_test))
print(classification_report(y_test, sgdc_y_predict, target_names=['Benign', 'Malignant']))

对手写数码图像识别（分类）模型（支持向量机）

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Handwritten-digit classification with a linear support vector machine.
digits = load_digits()
print(digits.data.shape)

x_train, x_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.25, random_state=33
)
print(y_train.shape)
print(y_test.shape)

# The scaler learns mean/variance from the training split only; the test split
# is transformed with those same statistics instead of being refitted.
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

lsvc = LinearSVC()
lsvc.fit(x_train, y_train)
y_predict = lsvc.predict(x_test)

print('Accuracy of Liner SVC is:', lsvc.score(x_test, y_test))
# target_names are converted to str because classification_report expects string labels.
print(classification_report(y_test, y_predict, target_names=digits.target_names.astype(str)))

新闻文本分类（朴素贝叶斯）

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# News-text classification with multinomial naive Bayes on raw token counts.
news = fetch_20newsgroups(subset='all')
print(len(news.data))
print(news.data[0])

x_train, x_test, y_train, y_test = train_test_split(
    news.data,
    news.target,
    test_size=0.25,
    random_state=33,
)

# The vocabulary is learned from the training documents only; test documents
# are encoded with that same vocabulary.
vec = CountVectorizer()
x_train = vec.fit_transform(x_train)
x_test = vec.transform(x_test)

# fit() returns the estimator itself, so construction and training can be chained.
mnb = MultinomialNB().fit(x_train, y_train)
y_predict = mnb.predict(x_test)

accuracy = mnb.score(x_test, y_test)
print('Accuracy of Naive Bayes Classifier is:', accuracy)
report = classification_report(y_test, y_predict, target_names=news.target_names)
print(report)

对鸢尾花（Iris）数据进行类别预测（K近邻分类）

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import  KNeighborsClassifier
from sklearn.metrics import classification_report

# Iris species classification with a k-nearest-neighbours classifier.
iris = load_iris()
print(iris.data.shape)

print(iris.DESCR)

x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=33
)

# Distances in KNN depend on feature scale, so both splits must share one
# scaler: fit on the training split, then only transform the test split.
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

knc = KNeighborsClassifier()
knc.fit(x_train, y_train)
y_predict = knc.predict(x_test)

print('Accuracy of K-nearest Neighbour Classifier is:', knc.score(x_test, y_test))
print(classification_report(y_test, y_predict, target_names=iris.target_names))

对泰坦尼克号乘客的生还情况预测（决策树）

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Titanic survival prediction with a single decision tree.
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
# head() only returns a frame; print it so the preview also shows up when run as a script.
print(titanic.head())

titanic.info()

# Work on an independent copy so the age imputation below modifies `x` itself
# rather than a slice of `titanic` (which pandas may warn about or ignore).
x = titanic[['pclass', 'age', 'sex']].copy()
y = titanic['survived']

x.info()

# Fill missing ages with the mean age; assignment is used instead of an
# in-place fillna on a column selection.
x['age'] = x['age'].fillna(x['age'].mean())

x.info()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)

vec = DictVectorizer(sparse=False)

# 'records' is the full orient name; the abbreviated 'record' is not accepted
# by recent pandas versions.
x_train = vec.fit_transform(x_train.to_dict(orient='records'))
print(vec.feature_names_)

# Reuse the feature mapping learned on the training split so the test matrix
# has exactly the same columns in the same order.
x_test = vec.transform(x_test.to_dict(orient='records'))

dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
y_predict = dtc.predict(x_test)

print(dtc.score(x_test, y_test))
print(classification_report(y_test, y_predict, target_names=['died', 'survived']))

对泰坦尼克号乘客的生还情况预测（集成模型（分类）：随机森林分类器和梯度提升决策树）

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Titanic survival prediction: decision tree vs. random forest vs. gradient boosting.
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

# Independent copy, so filling in ages does not operate on a slice of `titanic`.
x = titanic[['pclass', 'age', 'sex']].copy()
y = titanic['survived']

# Fill missing ages with the mean age (assignment instead of in-place fillna on a selection).
x['age'] = x['age'].fillna(x['age'].mean())

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)

# Learn the feature mapping on the training split and reuse it for the test
# split so both matrices share the same columns. 'records' is the full orient
# name; the abbreviated 'record' is not accepted by recent pandas versions.
vec = DictVectorizer(sparse=False)
x_train = vec.fit_transform(x_train.to_dict(orient='records'))
x_test = vec.transform(x_test.to_dict(orient='records'))

dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
dtc_y_predict = dtc.predict(x_test)

rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc_y_predict = rfc.predict(x_test)

gbc = GradientBoostingClassifier()
gbc.fit(x_train, y_train)
gbc_y_predict = gbc.predict(x_test)

# classification_report takes (y_true, y_pred); passing them the other way
# round swaps precision and recall in the printed table.
print('Accuracy of decision tree is:', dtc.score(x_test, y_test))
print(classification_report(y_test, dtc_y_predict))

print('Accuracy of random forest classifier is:', rfc.score(x_test, y_test))
print(classification_report(y_test, rfc_y_predict))

print('Accuracy of gradient tree classifier is:', gbc.score(x_test, y_test))
print(classification_report(y_test, gbc_y_predict))

PartTwo经典回归模型（做计算题：比如：计算某个问题的数值）

使用线性回归器对房屋价格进行预测LinearRegression和Stochastic_Gradient

代码后面的inverse_transform函数的作用是把归一化的数据还原

中间要对标签集进行reshape

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import stochastic_gradient
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
import numpy as np

# Boston house-price regression with LinearRegression and SGDRegressor.
boston = load_boston()
print(boston.DESCR)  # show the dataset description

x = boston.data
y = boston.target

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)
# Author's note: the original post argues that scaling should come before the
# split; this code follows the book and splits first. Fitting the scalers on
# the training split only keeps test rows out of the scaling statistics.
print("The max target value is", np.max(boston.target))
print("The min target value is", np.min(boston.target))
print("The average target value is ", np.mean(boston.target))

# The targets span a wide range, so features and targets are both standardized.
# The scalers are created unfitted: fit_transform below fits them on the
# training split, so a prior fit on the full data would only be discarded.
ss_X = StandardScaler()
ss_y = StandardScaler()

x_train = ss_X.fit_transform(x_train)
x_test = ss_X.transform(x_test)

# StandardScaler works on 2-D input, hence the reshape to a single column.
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))

lr = LinearRegression()
lr.fit(x_train, y_train)
lr_y_predict = lr.predict(x_test)

# NOTE: `sklearn.linear_model.stochastic_gradient` is a private module that
# newer scikit-learn releases no longer provide; there, import SGDRegressor
# directly from sklearn.linear_model instead.
sgdr = stochastic_gradient.SGDRegressor()
# SGDRegressor expects a 1-D target, so the column vector is flattened.
sgdr.fit(x_train, y_train.ravel())
sgdr_y_predict = sgdr.predict(x_test)
# inverse_transform needs 2-D input, so keep a column-shaped view of the predictions.
sgdr_y_predict_2d = sgdr_y_predict.reshape(-1, 1)

# score() and r2_score are computed on the standardized targets; the squared
# and absolute errors are reported after mapping back to the original scale.
print('The value of default measurement of LinearRegression is', lr.score(x_test, y_test))
print('The value of R-squred of LinearRegression is', r2_score(y_test, lr_y_predict))
print('The mean squred error of LinearRegression is', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict)))
print('The mean absoluate error of LinearRegression is', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict)))

print('The value of default measurement of SGDRegressor is', sgdr.score(x_test, y_test))
print('The value of R-squred of SGDRegressor is', r2_score(y_test, sgdr_y_predict))
print('The mean squred error of SGDRegressor is', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgdr_y_predict_2d)))
print('The mean absoluate error of SGDRegressor is', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgdr_y_predict_2d)))

以三种不同的核函数来使用支持向量机模型对房屋价格进行预测

^(*￣(oo)￣)^：我第一遍打代码的时候没有对数据进行归一化操作

其中核函数linear 没有受到影响

但是使用核函数poly训练没有归一化的数据会卡住

核函数rbf的预测准确程度会大幅度下降

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Boston house-price regression with SVR using linear, polynomial and RBF kernels.
boston = load_boston()
x = boston.data
y = boston.target

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)

ss_x = StandardScaler()
ss_y = StandardScaler()

# Scalers are fitted on the training split and reused for the test split.
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)
# StandardScaler works on 2-D input, hence the reshape to a single column.
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))

# SVR expects a 1-D target; every kernel is trained on the flattened column so
# the three fits are consistent and none triggers a column-vector warning.
linear_svr = SVR(kernel='linear')
linear_svr.fit(x_train, y_train.ravel())
linear_svr_y_predict = linear_svr.predict(x_test)

poly_svr = SVR(kernel='poly')
poly_svr.fit(x_train, y_train.ravel())
poly_svr_y_predict = poly_svr.predict(x_test)

rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(x_train, y_train.ravel())
rbf_svr_y_predict = rbf_svr.predict(x_test)

# All metrics below are computed on the standardized targets.
print('I AM Linear_SVR')
print('score', linear_svr.score(x_test, y_test))
print('R-squared', r2_score(y_test, linear_svr_y_predict))
print('mean squared', mean_squared_error(y_test, linear_svr_y_predict))
print('mean absolute', mean_absolute_error(y_test, linear_svr_y_predict))

print('I AM POLY_SVR')
print('score', poly_svr.score(x_test, y_test))
print('R-squared', r2_score(y_test, poly_svr_y_predict))
print('mean squared', mean_squared_error(y_test, poly_svr_y_predict))
print('mean absolute', mean_absolute_error(y_test, poly_svr_y_predict))

print('I AM RBF_SVR')
print('score', rbf_svr.score(x_test, y_test))
print('R-squared', r2_score(y_test, rbf_svr_y_predict))
print('mean squared', mean_squared_error(y_test, rbf_svr_y_predict))
print('mean absolute', mean_absolute_error(y_test, rbf_svr_y_predict))

使用两种不同配置的k近邻回归模型对房价进行预测（普通算术平均算法和加权平均）

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#________________________________________________________________________________________
from sklearn.neighbors import KNeighborsRegressor#k近邻回归器

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Boston house-price regression with two k-nearest-neighbour configurations:
# uniform averaging of the neighbours vs. distance-weighted averaging.
boston = load_boston()
x, y = boston.data, boston.target

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=33,
)

ss_x = StandardScaler()
ss_y = StandardScaler()

# Features: fit on the training split, reuse the statistics for the test split.
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)
# Targets: reshaped to one column because StandardScaler works on 2-D input.
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))

# Plain average over the neighbours.
uni_knr = KNeighborsRegressor(weights='uniform')
uni_knr.fit(x_train, y_train)
uni_knr_y_predict = uni_knr.predict(x_test)

# Average weighted by neighbour distance.
dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(x_train, y_train)
dis_knr_y_predict = dis_knr.predict(x_test)

# Print the same four metrics for each regressor, in the original order.
for title, model, predicted in (
    ('I AM uni_knr', uni_knr, uni_knr_y_predict),
    ('I AM dis_knr', dis_knr, dis_knr_y_predict),
):
    print(title)
    print('score', model.score(x_test, y_test))
    print('R-squared', r2_score(y_test, predicted))
    print('mean squared', mean_squared_error(y_test, predicted))
    print('mean absolute', mean_absolute_error(y_test, predicted))

使用单一回归树对房价进行预测

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#_________________________________________________________

from sklearn.tree import DecisionTreeRegressor#导入回归树模型

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Boston house-price regression with a single regression tree.
boston = load_boston()
x, y = boston.data, boston.target

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=33,
)

ss_x = StandardScaler()
ss_y = StandardScaler()

# Features: fit on the training split, reuse the statistics for the test split.
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)
# Targets: reshaped to one column because StandardScaler works on 2-D input.
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))

# fit() returns the estimator itself, so construction and training are chained.
dtr = DecisionTreeRegressor().fit(x_train, y_train)
dtr_y_predict = dtr.predict(x_test)

print('I AM DecisionTreeRegressor')
print('score', dtr.score(x_test, y_test))
# The remaining metrics share the (y_true, y_pred) signature, so iterate over them.
for label, metric in (
    ('R-squared', r2_score),
    ('mean squared', mean_squared_error),
    ('mean absolute', mean_absolute_error),
):
    print(label, metric(y_test, dtr_y_predict))

使用三种集成回归模型对房价进行预测（RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor）

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#_______________________________________________________________________________________

from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Boston house-price regression with three ensemble models:
# random forest, extremely randomized trees and gradient boosting.
boston = load_boston()
x = boston.data
y = boston.target

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)

ss_x = StandardScaler()
ss_y = StandardScaler()

# Scalers are fitted on the training split and reused for the test split.
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)
# StandardScaler works on 2-D input, hence the reshape to a single column.
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))

# The ensemble regressors expect a 1-D target; flattening the column vector
# avoids the column-vector conversion warning on every fit.
rfr = RandomForestRegressor()
rfr.fit(x_train, y_train.ravel())
rfr_y_predict = rfr.predict(x_test)

etr = ExtraTreesRegressor()
etr.fit(x_train, y_train.ravel())
etr_y_predict = etr.predict(x_test)

gbr = GradientBoostingRegressor()
gbr.fit(x_train, y_train.ravel())
gbr_y_predict = gbr.predict(x_test)

# All metrics below are computed on the standardized targets.
print('I AM RandomForestRegressor')
print('score', rfr.score(x_test, y_test))
print('R-squared', r2_score(y_test, rfr_y_predict))
print('mean squared', mean_squared_error(y_test, rfr_y_predict))
print('mean absolute', mean_absolute_error(y_test, rfr_y_predict))

print('I AM ExtraTreesRegressor')
print('score', etr.score(x_test, y_test))
print('R-squared', r2_score(y_test, etr_y_predict))
print('mean squared', mean_squared_error(y_test, etr_y_predict))
print('mean absolute', mean_absolute_error(y_test, etr_y_predict))

print('I AM GradientBoostingRegressor')
print('score', gbr.score(x_test, y_test))
print('R-squared', r2_score(y_test, gbr_y_predict))
print('mean squared', mean_squared_error(y_test, gbr_y_predict))
print('mean absolute', mean_absolute_error(y_test, gbr_y_predict))

转载于:https://www.cnblogs.com/IAMzhuxiaofeng/p/8808780.html

点赞
收藏
分享
文章举报

azvvar6169 发布了0 篇原创文章 · 获赞 0 · 访问量 178 私信关注

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航

Python 机器学习及实践 Coded One :使用经典的分类模型和回归模型对数据进行训练 预测 评估 PS 欢迎指点指明错误！！！！...

Python 机器学习及实践 Coded One :使用经典的分类模型和回归模型对数据进行训练预测评估 PS 欢迎指点指明错误！！！！...