您的位置：首页 > 编程语言 > Python开发

python包xgboost安装和简单使用

2017-12-22 20:31 901 查看

1、xgboost安装

首先，xgboost 是一个实现梯度提升（gradient boosting）算法的库，提供 Python 包，常用于分类、回归等数据分析任务。

cp36代表python3.6。

numpy-1.13.1+mkl-cp36-cp36m-win_amd64.whl
scipy-0.19.1-cp36-cp36m-win_amd64.whl
xgboost-0.6-cp36-cp36m-win_amd64.whl

从这里下载 xgboost 的 whl 文件，然后打开 cmd，切换到下载文件所在的路径，运行：

F:\下载软件>pip3 install xgboost-0.6+20171121-cp36-cp36m-win_amd64.whl

注：一开始使用pip安装的时候出错，提示如下（这个错误通常表示 whl 文件名中的 Python 版本/位数标签，如 cp36、win_amd64，与本机的 Python 不匹配，或者 pip 版本过旧）：

xgboost-0.6+20171121-cp36-cp36m-win_amd64.whl is not a supported wheel on this platform.

一开始按照知乎专栏的文章安装了xgboost，但是加载不了模型，然后，再按照简书那篇文章，重新安装了xgboost。感觉，应该随意按照其中一篇文章，都可以安装成功。

2、xgboost调试

xgboost 在 Python 中有两种使用方式：1、原生接口；2、scikit-learn 接口；

xgboost处理的问题分成两种：1、分类问题；2、回归问题

2.1 Demo1 基于XGBoost原生接口的分类

#xgboost
#分类
from sklearn.datasets import load_iris
import xgboost as xgb
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

# Demo 1: 3-class classification on iris with the native xgboost API.

# Load the iris data set and split off 20% of it as a test set.
iris = load_iris()

X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234565)

# Booster parameters. The predictions are compared directly against the
# integer class labels further down.
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',
    'num_class': 3,
    'gamma': 0.1,
    'max_depth': 6,
    'lambda': 2,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 3,
    'silent': 1,
    'eta': 0.1,
    'seed': 1000,
    'nthread': 4,
}

dtrain = xgb.DMatrix(X_train, label=y_train)
num_rounds = 500
# Pass the dict itself. The original passed `params.items()`, which is a
# dict view on Python 3 and is rejected by newer xgboost releases.
model = xgb.train(params, dtrain, num_boost_round=num_rounds)

# Predict on the test set.
dtest = xgb.DMatrix(X_test)
ans = model.predict(dtest)

# Accuracy = share of test samples whose predicted class equals the label.
correct = sum(1 for predicted, actual in zip(ans, y_test) if predicted == actual)
print("Accuracy: %.2f %% " % (100 * correct / len(y_test)))

# Plot the feature importances of the trained booster.
plot_importance(model)
plt.show()

运行结果：

2.2 Demo2-基于Scikit-learn接口的分类

#5.3 基于Scikit-learn接口的分类

from sklearn.datasets import load_iris
import xgboost as xgb
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

# Demo 2: 3-class classification on iris with the scikit-learn wrapper.

# Load the iris data set and split off 20% of it as a test set.
iris = load_iris()

X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train the model.
model = xgb.XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=160, silent=True, objective='multi:softmax')
model.fit(X_train, y_train)

# Predict on the test set.
ans = model.predict(X_test)

# Accuracy = share of test samples whose predicted class equals the label.
correct = sum(1 for predicted, actual in zip(ans, y_test) if predicted == actual)
print("Accuracy: %.2f %% " % (100 * correct / len(y_test)))

# Plot the feature importances of the trained model.
plot_importance(model)
plt.show()

以下两个demo是回归问题的实现：

2.3 demo3-基于Scikit-learn接口的回归

#5.4 基于Scikit-learn接口的回归

import xgboost as xgb
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import csv

# Demo 3: regression with the scikit-learn wrapper on data read from a CSV file.

# Loading method 1 (alternative, kept for reference): split each line by hand.
# data = []
# labels = []
# labels2 = []
# with open(r"G:\0研究生\tianchiCompetition\训练小样本3_label.csv", encoding='UTF-8') as fileObject:
#     for line in fileObject:
#         line_split = line.split(',')
#         data.append(line_split[10:])
#         labels.append(line_split[8])
#
# X = []
# for row in data:
#     row = [float(x) for x in row]
#     X.append(row)
#
# y = [float(x) for x in labels]

# Loading method 2: parse every CSV row as floats.
X = []
y = []
# `with` closes the file when reading is done; newline="" is the mode the
# csv module asks for.
with open(r"G:\0研究生\tianchiCompetition\训练小样本3_label.csv", "r", newline="") as csvFile:
    reader = csv.reader(csvFile)
    for item in reader:
        # Uncomment to skip a header row:
        # if reader.line_num == 1:
        #     continue
        if not item:
            # A blank line would otherwise produce an empty row and make
            # the pop() below fail.
            continue
        X.append([float(ii) for ii in item])

# The last column of every row is the label: move it from X into y so the
# model is not trained on its own target.
for row in X:
    y.append(row.pop())

# Train/test split and training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=160, silent=True, objective='reg:gamma')
model.fit(X_train, y_train)

# Predict on the test set.
ans = model.predict(X_test)

# Plot the feature importances of the trained model.
plot_importance(model)
plt.show()

2.4 demo4-基于XGBoost原生接口的回归

# 5.2 基于XGBoost原生接口的回归

import xgboost as xgb
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import csv

from xgboost import XGBClassifier
# Demo 4: regression with the native xgboost API on data read from a CSV file.

# Loading method 1 (alternative, kept for reference): split each line by hand.
# data = []
# labels = []
# labels2 = []
# with open("lppz5.csv", encoding='UTF-8') as fileObject:
#     for line in fileObject:
#         line_split = line.split(',')
#         data.append(line_split[10:])
#         labels.append(line_split[8])
#
# X = []
# for row in data:
#     row = [float(x) for x in row]
#     X.append(row)
#
# y = [float(x) for x in labels]

# Loading method 2: parse every CSV row as floats.
X = []
y = []
# `with` closes the file when reading is done; newline="" is the mode the
# csv module asks for.
with open(r"G:\0研究生\tianchiCompetition\训练小样本4large_label.csv", "r", newline="") as csvFile:
    reader = csv.reader(csvFile)
    for item in reader:
        # Uncomment to skip a header row:
        # if reader.line_num == 1:
        #     continue
        if not item:
            # A blank line would otherwise produce an empty row and make
            # the pop() below fail.
            continue
        X.append([float(ii) for ii in item])

# The last column of every row is the label: move it from X into y so the
# model is not trained on its own target.
for row in X:
    y.append(row.pop())

# Train/test split and training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

params = {
    'booster': 'gbtree',
    'objective': 'reg:gamma',
    'gamma': 0.1,
    'max_depth': 5,
    'lambda': 3,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 3,
    'silent': 1,
    'eta': 0.1,
    'seed': 1000,
    'nthread': 4,
}

dtrain = xgb.DMatrix(X_train, label=y_train)
# num_rounds = 300
num_rounds = 126
# Pass the dict itself. The original passed `params.items()`, which is a
# dict view on Python 3 and is rejected by newer xgboost releases.
model = xgb.train(params, dtrain, num_boost_round=num_rounds)

# Predict on the test set.
dtest = xgb.DMatrix(X_test)
ans = model.predict(dtest)

# Plot the feature importances of the trained booster.
plot_importance(model)
plt.show()

运行程序出错：

raise ValueError('Booster.get_score() results in empty')
ValueError: Booster.get_score() results in empty

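现在，还不是很懂这是什么问题。从报错信息看，plot_importance 会调用 Booster.get_score()，结果为空说明训练出的树没有使用任何特征进行分裂，因此没有特征重要性可画；可能的原因包括样本太少、min_child_weight 相对样本量过大，或者特征上找不到有增益的分裂点（待确认）。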

把读取的代码改了，

# Copy (instead of pop) the last column into y.
# NOTE: the label then also stays in X as a feature, so the model is
# trained on its own target.
for row in X:
    y.append(row[-1])

就是X保留label值，运行没有问题，但是，运行结果却是：

这很奇怪，为什么只有最后一列（label）的feature，这样对我没有什么意义。原因应该是：这样改之后 label 仍然留在 X 里作为一个特征（标签泄漏），模型只用这一列就能拟合 y，所以特征重要性里只会出现这一列。

3、其他使用

定义model，获得gbtree.

### xgboost usage: train a regression booster with an evaluation watchlist.
import pandas as pd
import numpy as np
import xgboost as xgb
# `sklearn.cross_validation` was removed from scikit-learn; the other demos
# in this article already import from `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from xgboost import plot_importance

train_df = pd.read_csv(r'G:\0研究生\tianchiCompetition\训练小样本.csv')
# Feature columns used for training; the target is the `Y` column.
x_list = ['210X1', '220X2', '310X2', '311X72', '261X237']
X_train = train_df[x_list].values
y_train = train_df.Y.values

# Hold out 30% of the rows as an evaluation set.
X_dtrain, X_deval, y_dtrain, y_deval = train_test_split(X_train, y_train, random_state=1026, test_size=0.3)

dtrain = xgb.DMatrix(X_dtrain, label=y_dtrain)

deval = xgb.DMatrix(X_deval, label=y_deval)
watchlist = [(deval, 'eval')]
params = {
    'booster': 'gbtree',
    # NOTE(review): newer xgboost releases name this objective
    # 'reg:squarederror' - confirm against the installed version.
    'objective': 'reg:linear',
    'subsample': 0.8,
    'colsample_bytree': 0.85,
    'eta': 0.05,
    'max_depth': 7,
    'seed': 2016,
    'silent': 0,
    'eval_metric': 'rmse'
}
# One sample to predict, as an explicit 1 x 5 matrix (one value per entry
# of x_list) rather than a flat list, so DMatrix always sees 2-D input.
df_test = np.array([[100, 1.5, 1.5, 90, 95]])
# early_stopping_rounds equals the number of boosting rounds here.
clf = xgb.train(params, dtrain, num_boost_round=50, evals=watchlist, early_stopping_rounds=50)
print('X_dtrain',X_dtrain)  # training feature matrix
print('dtrain\n',dtrain)
pred = clf.predict(xgb.DMatrix(df_test))
print(pred)

# Plot the feature importances of the trained booster.
plot_importance(clf)
plt.show()

参考：

xgboost: 速度快效果好的 boosting 模型；

xgboost基本入门_知乎专栏；

xgboost安装指南；

xgboost在Windows安装_简书

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航