集成学习自动权重设置python实现
2016-11-25 16:45
381 查看
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import os
os.system("ls ../input")
train = pd.read_csv("../input/train.csv")
print("Training set has {0[0]} rows and {0[1]} columns".format(train.shape))
labels = train['target']
train.drop(['target', 'id'], axis=1, inplace=True)
print(train.head())
### we need a test set that we didn't train on to find the best weights for combining the classifiers
sss = StratifiedShuffleSplit(labels, test_size=0.05, random_state=1234)
for train_index, test_index in sss:
break
train_x, train_y = train.values[train_index], labels.values[train_index]
test_x, test_y = train.values[test_index], labels.values[test_index]
### building the classifiers
clfs = []
rfc = RandomForestClassifier(n_estimators=50, random_state=4141, n_jobs=-1)
rfc.fit(train_x, train_y)
print('RFC LogLoss {score}'.format(score=log_loss(test_y, rfc.predict_proba(test_x))))
clfs.append(rfc)
### usually you'd use xgboost and neural nets here
logreg = LogisticRegression()
logreg.fit(train_x, train_y)
print('LogisticRegression LogLoss {score}'.format(score=log_loss(test_y, logreg.predict_proba(test_x))))
clfs.append(logreg)
rfc2 = RandomForestClassifier(n_estimators=50, random_state=1337, n_jobs=-1)
rfc2.fit(train_x, train_y)
print('RFC2 LogLoss {score}'.format(score=log_loss(test_y, rfc2.predict_proba(test_x))))
clfs.append(rfc2)
### finding the optimum weights
predictions = []
for clf in clfs:
predictions.append(clf.predict_proba(test_x))
def log_loss_func(weights):
''' scipy minimize will pass the weights as a numpy array '''
final_prediction = 0
for weight, prediction in zip(weights, predictions):
final_prediction += weight*prediction
return log_loss(test_y, final_prediction)
#the algorithms need a starting value, right not we chose 0.5 for all weights
#its better to choose many random starting points and run minimize a few times
starting_values = [0.5]*len(predictions)
#adding constraints and a different solver as suggested by user 16universe
#https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
#our weights are bound between 0 and 1
bounds = [(0,1)]*len(predictions)
res = minimize(log_loss_func, starting_values, method='SLSQP', bounds=bounds, constraints=cons)
print('Ensamble Score: {best_score}'.format(best_score=res['fun']))
print('Best Weights: {weights}'.format(weights=res['x']))
转载自 https://www.kaggle.com/hsperr/otto-group-product-classification-challenge/finding-ensamble-weights
import numpy as np
from scipy.optimize import minimize
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import os
os.system("ls ../input")
train = pd.read_csv("../input/train.csv")
print("Training set has {0[0]} rows and {0[1]} columns".format(train.shape))
labels = train['target']
train.drop(['target', 'id'], axis=1, inplace=True)
print(train.head())
### we need a test set that we didn't train on to find the best weights for combining the classifiers
sss = StratifiedShuffleSplit(labels, test_size=0.05, random_state=1234)
for train_index, test_index in sss:
break
train_x, train_y = train.values[train_index], labels.values[train_index]
test_x, test_y = train.values[test_index], labels.values[test_index]
### building the classifiers
clfs = []
rfc = RandomForestClassifier(n_estimators=50, random_state=4141, n_jobs=-1)
rfc.fit(train_x, train_y)
print('RFC LogLoss {score}'.format(score=log_loss(test_y, rfc.predict_proba(test_x))))
clfs.append(rfc)
### usually you'd use xgboost and neural nets here
logreg = LogisticRegression()
logreg.fit(train_x, train_y)
print('LogisticRegression LogLoss {score}'.format(score=log_loss(test_y, logreg.predict_proba(test_x))))
clfs.append(logreg)
rfc2 = RandomForestClassifier(n_estimators=50, random_state=1337, n_jobs=-1)
rfc2.fit(train_x, train_y)
print('RFC2 LogLoss {score}'.format(score=log_loss(test_y, rfc2.predict_proba(test_x))))
clfs.append(rfc2)
### finding the optimum weights
predictions = []
for clf in clfs:
predictions.append(clf.predict_proba(test_x))
def log_loss_func(weights):
''' scipy minimize will pass the weights as a numpy array '''
final_prediction = 0
for weight, prediction in zip(weights, predictions):
final_prediction += weight*prediction
return log_loss(test_y, final_prediction)
#the algorithms need a starting value, right not we chose 0.5 for all weights
#its better to choose many random starting points and run minimize a few times
starting_values = [0.5]*len(predictions)
#adding constraints and a different solver as suggested by user 16universe
#https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
#our weights are bound between 0 and 1
bounds = [(0,1)]*len(predictions)
res = minimize(log_loss_func, starting_values, method='SLSQP', bounds=bounds, constraints=cons)
print('Ensamble Score: {best_score}'.format(best_score=res['fun']))
print('Best Weights: {weights}'.format(weights=res['x']))
转载自 https://www.kaggle.com/hsperr/otto-group-product-classification-challenge/finding-ensamble-weights
相关文章推荐
- 【机器学习入门二】集成学习及AdaBoost算法的python实现
- python学习之路之案例0(实现登录功能,登录错误次数超过3次,自动退出登录)
- 集成学习AdaBoost算法原理及python实现
- 贝叶斯分类方法学习三 python+jieba+mongodb实现朴素贝叶斯新闻文本自动分类
- python-IDE-spyder自动设置时间,作者、联系方式的方法--python学习笔记27
- SpringBoot学习笔记(5) Spring Boot集成Redis实现自动配置
- Python的学习(二十三)---python实现网站自动登录
- ASP.NET MVC学习笔记 -- NerdDinner实战1 -- 中文Visual Studio 2008的设置修改以实现复数表单数类名
- 发现shedskin的example是学习算法的好材料(Python实现)
- 通过crond自动运行Python脚本实现多台linux服务器的监控
- 一个无聊男人的疯狂《数据结构与算法分析-C++描述》学习笔记 用C++/lua/python/bash的四重实现(5)欧几里得算法欧几里得算法求最大公约数
- 将C++代码全部写到头文件:)python脚本帮助自动生成相应的实现文件初始框架
- 一个无聊男人的疯狂《数据结构与算法分析-C++描述》学习笔记 用C++/lua/python/bash的四重实现(7)习题2.8 随机数组的三种生成算法
- (原创)用python实现自动扫雷机
- 如何设置路由器并利用路由器+宽带猫实现单机或是多机共享自动拨号上网
- 一个无聊男人的疯狂《数据结构与算法分析-C++描述》学习笔记 用C++/lua/python/bash的四重实现(6)高效率的幂运算
- OpenCV学习——物体跟踪的粒子滤波算法实现之权重归一化
- Atlas学习手记(4):使用AutoComplete Extender实现自动完成功能
- 在iframe中使用js代码实现自动设置栽入的页面的高度自动化
- python设置检查点简单实现