
Machine Learning Algorithms - Linear Regression Check-in

2020-06-05 08:24

Some reflections on this check-in:
  1. Calling the LinearRegression library is fairly simple; the matrix-formula method was mainly a chance to get familiar with several of numpy's matrix operations, and the computation went smoothly.
  2. The main problem with gradient descent was a numeric overflow error when the whole-batch iteration used np.sum; with the np.mean function given in the example there was no problem (a short sketch after the data-setup code below illustrates why).
  3. Iterating with mini-batches also resolves the overflow cleanly, as shown in the code for that part. The check-in code follows.
  4. House-price prediction uses the Boston housing dataset. In the end the score never felt very high: after trying several different models, the best test-set R² was about 0.74. Perhaps the accuracy can be pushed higher later.

import time
import numpy as np

# Generate random inputs
np.random.seed(0)
x = np.random.rand(1000, 3)
# Build the mapping that simulates the true values to predict:
# y = 4.5*x1 + 5.6*x2 + 6.7*x3, i.e. the parameters are [4.5, 5.6, 6.7]
y = x.dot(np.array([4.5, 5.6, 6.7]))
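As a quick illustration of the overflow from point 2 above, here is a minimal sketch (my own, reusing the x and y just generated): a full-batch update with np.sum effectively multiplies the step size by the number of samples (1000 here), so alpha = 0.02 diverges, while the np.mean version keeps the step at alpha and stays stable.

alpha = 0.02
w_sum = np.zeros(3)
w_mean = np.zeros(3)
for _ in range(50):
    residual_sum = (y - x.dot(w_sum)).reshape(-1, 1)
    residual_mean = (y - x.dot(w_mean)).reshape(-1, 1)
    # Summed gradient: each step is ~1000x too large and the weights blow up.
    w_sum = w_sum + alpha * np.sum(x * residual_sum, axis=0)
    # Averaged gradient: the same direction with a stable step of size alpha.
    w_mean = w_mean + alpha * np.mean(x * residual_mean, axis=0)
print(w_sum)   # overflows to inf/nan within a few iterations (the overflow warning)
print(w_mean)  # stays finite and moves toward [4.5, 5.6, 6.7]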

1. Train a model with the LinearRegression library, compute the parameters, and predict

import numpy as np
from sklearn.linear_model import LinearRegression, SGDRegressor
import matplotlib.pyplot as plt
%matplotlib inline

time1 = time.time()
# Instantiate the model
lr = LinearRegression(fit_intercept=True)
# Fit the model
lr.fit(x, y)
print("Estimated parameters: %s" % (lr.coef_))
# Compute R squared
print('R2: %s' % (lr.score(x, y)))
# Predict the target for an arbitrary new input
x_test = np.array([3, 4, 5]).reshape(1, -1)
y_hat = lr.predict(x_test)
print("Prediction: %s" % (y_hat))
print(time.time() - time1)
Estimated parameters: [4.5 5.6 6.7]
R2: 1.0
Prediction: [69.4]
0.10936808586120605

2. Solve for the parameters with the least-squares matrix formula and predict

time1 = time.time()

class LR_LS():
    def __init__(self):
        self.w = None

    def fit(self, X, y):
        # Closed-form least-squares (normal equation) solution
        self.w = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)

    def predict(self, X):
        # Predict new inputs with the fitted parameters
        y_pred = X.dot(self.w)
        return y_pred

if __name__ == "__main__":
    lr_ls = LR_LS()
    lr_ls.fit(x, y)
    print("Estimated parameters: %s" % (lr_ls.w))
    x_test = np.array([3, 4, 5]).reshape(1, -1)
    print("Prediction: %s" % (lr_ls.predict(x_test)))
    print(time.time() - time1)
Estimated parameters: [4.5 5.6 6.7]
Prediction: [69.4]
0.003999471664428711
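The class above implements the normal equation w = (XᵀX)⁻¹Xᵀy. A side note: explicitly inverting XᵀX is fine on this toy problem but can be numerically fragile when the features are nearly collinear; numpy's built-in least-squares solver handles that case more stably. A minimal equivalent using the same x and y:

# Solve min_w ||x·w - y||^2 directly; more stable than forming inv(X^T X).
w, residuals, rank, sv = np.linalg.lstsq(x, y, rcond=None)
print(w)  # [4.5 5.6 6.7], matching LR_LS above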

3. Solve for the parameters with mini-batch gradient descent and predict

time1 = time.time()

class LR_GD():
    def __init__(self):
        self.w = None  # parameter vector
        self.k = 0     # mini-batch start index

    def fit(self, X, y, alpha=0.02, loss=1e-10):
        # Step size alpha = 0.02; convergence threshold 1e-10
        y = y.reshape(-1, 1)    # reshape y for matrix arithmetic
        [m, d] = np.shape(X)    # dimensions of the inputs
        self.w = np.zeros((d))  # initialize the parameters to zero
        tol = 1e5
        while tol > loss:
            if self.k < len(y):
                # Predictions for the current mini-batch of 80 samples
                h_f = X[self.k:self.k + 80, :].dot(self.w).reshape(-1, 1)
                # Gradient update on the mini-batch
                theta = self.w + alpha * np.sum(X[self.k:self.k + 80] * (y[self.k:self.k + 80] - h_f), axis=0)
                tol = np.sum(np.abs(theta - self.w))
                self.w = theta
                self.k += 80
            else:
                self.k = 0  # wrap around to the start of the data

    def predict(self, X):
        # Predict new inputs with the fitted parameters
        y_pred = X.dot(self.w)
        return y_pred

if __name__ == "__main__":
    lr_gd = LR_GD()
    lr_gd.fit(x, y)
    print("Estimated parameters: %s" % (lr_gd.w))
    x_test = np.array([3, 4, 5]).reshape(1, -1)
    print("Prediction: %s" % (lr_gd.predict(x_test)))
    print(time.time() - time1)
Estimated parameters: [4.5 5.6 6.7]
Prediction: [69.4]
0.02803659439086914
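For comparison, the same mini-batch pattern is available in scikit-learn through SGDRegressor.partial_fit, which performs one update per call. A rough sketch on the same toy data (batch size 80 as above; the epoch count and eta0 here are my own untuned choices):

sgd = SGDRegressor(eta0=0.02, random_state=0)
for epoch in range(50):
    for k in range(0, len(y), 80):
        # One gradient update on each mini-batch of (up to) 80 samples
        sgd.partial_fit(x[k:k + 80], y[k:k + 80])
print(sgd.coef_, sgd.intercept_)  # should end up near [4.5 5.6 6.7] and ~0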

4. Predicting Boston house prices

from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor

# Load the data (note: load_boston was removed in scikit-learn 1.2;
# this code assumes an older version)
boston = load_boston()
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.25)
# Standardize the features
std_x = StandardScaler()
X_train = std_x.fit_transform(X_train)
X_test = std_x.transform(X_test)
# Standardize the target
std_y = StandardScaler()
y_train = std_y.fit_transform(y_train.reshape(-1, 1))
y_test = std_y.transform(y_test.reshape(-1, 1))

(1) Stochastic gradient descent

time1 = time.time()
# Instantiate the model
lr = SGDRegressor(penalty='l2', max_iter=3000)
# Fit the model
lr.fit(X_train, y_train.ravel())
print("Estimated parameters: %s" % (lr.coef_))
# Compute R squared on the training set
print('R2: %s' % (lr.score(X_train, y_train)))
print(time.time() - time1)
# R squared on the test set
print('R2: %s' % (lr.score(X_test, y_test)))
Estimated parameters: [-8.77432412e-02  8.01666908e-02 -2.56564353e-02  9.97848814e-02
 -1.83553987e-01  2.72407144e-01 -3.10643292e-04 -2.82239516e-01
  2.05851012e-01 -1.02669531e-01 -2.23618152e-01  8.48913007e-02
 -3.91006815e-01]
R2: 0.7347083069184601
0.008996248245239258
R2: 0.7422192182734759
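Since the target was standardized as well, the predictions above are in standard-deviation units. To report actual prices (the dataset's original units, $1000s), the target scaling can be inverted with the std_y scaler; a small sketch using the SGD model just fitted:

# Map standardized predictions back onto the original price scale.
y_pred = std_y.inverse_transform(lr.predict(X_test).reshape(-1, 1))
print(y_pred[:5].ravel())  # predicted prices in $1000s

Note that R² is unchanged by this linear rescaling, so the scores reported here are unaffected.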

(2) Lasso regression

from sklearn.linear_model import Lasso

time1 = time.time()
# Instantiate the model
lr = Lasso(alpha=0.15)
# Fit the model
lr.fit(X_train, y_train.ravel())
print("Estimated parameters: %s" % (lr.coef_))
# Compute R squared on the training set
print('R2: %s' % (lr.score(X_train, y_train)))
print(time.time() - time1)
# R squared on the test set
print('R2: %s' % (lr.score(X_test, y_test)))
Estimated parameters: [-0.          0.         -0.          0.         -0.          0.24192003
 -0.         -0.         -0.         -0.         -0.14165717  0.
 -0.38271794]
R2: 0.6371149647543979
0.007995843887329102
R2: 0.6852038681235246

(3) Ridge regression

from sklearn.linear_model import Ridge

time1 = time.time()
# Instantiate the model
lr = Ridge(alpha=1.5, solver='auto')
# Fit the model
lr.fit(X_train, y_train.ravel())
print("Estimated parameters: %s" % (lr.coef_))
# Compute R squared on the training set
print('R2: %s' % (lr.score(X_train, y_train)))
print(time.time() - time1)
# R squared on the test set
print('R2: %s' % (lr.score(X_test, y_test)))
Estimated parameters: [-0.10723503  0.10814213  0.01352789  0.09787675 -0.24373632  0.25670862
  0.01502854 -0.31017672  0.33230503 -0.23563901 -0.23956068  0.08607204
 -0.41032024]
R2: 0.7406333751615662
0.006994962692260742
R2: 0.7294482269639491
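On the note above about trying to push the score higher: alpha=1.5 is hand-picked, and RidgeCV can instead choose the penalty by cross-validation over a grid. A hedged sketch (the grid below is my own choice, not from the original):

from sklearn.linear_model import RidgeCV

# Search a log-spaced grid of penalties by cross-validation on the training set.
ridge_cv = RidgeCV(alphas=np.logspace(-3, 3, 13))
ridge_cv.fit(X_train, y_train.ravel())
print("best alpha:", ridge_cv.alpha_)
print("test R2:", ridge_cv.score(X_test, y_test.ravel()))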

(4) Decision tree

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import make_scorer, r2_score

def fit_model(X, y):
    cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)
    regressor = DecisionTreeRegressor(random_state=10)
    params = {"max_depth": range(1, 11)}
    scoring_fnc = make_scorer(r2_score)
    grid = GridSearchCV(regressor, params, scoring=scoring_fnc, cv=cv_sets)
    grid = grid.fit(X, y)
    return grid.best_estimator_

# Fit on the training set to find the best model
reg = fit_model(X_train, y_train.ravel())
print("Parameter 'max_depth' is {} for the optimal model.".format(reg.get_params()['max_depth']))
# Score that model on the test set
y_predict = reg.predict(X_test)
r2 = r2_score(y_test, y_predict)
print("Optimal model has R^2 score {:,.2f} on test data".format(r2))
Parameter 'max_depth' is 4 for the optimal model.
Optimal model has R^2 score 0.60 on test data
