Python scikit-learn 模板
2016-06-27 09:46
405 查看
原文:
http://blog.csdn.net/zouxy09/article/details/48903179
代码如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Template for training and comparing several scikit-learn classifiers.

Each ``*_classifier`` function fits one model on ``(train_x, train_y)`` and
returns it.  When run as a script, the selected classifiers are trained on
the conversation data set, evaluated individually, then combined twice:
once by a logistic regression fitted on their positive-class scores and
once by a strict majority vote over their hard predictions.

The code runs on both Python 2 and Python 3: every ``print`` call takes a
single pre-formatted argument, and Python-2-only names are guarded.
"""
import sys
import os
import time

import numpy as np

try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3

if sys.version_info[0] < 3:
    # Python 2 only: make utf8 the implicit str<->unicode codec.
    reload(sys)
    sys.setdefaultencoding('utf8')

# Input files read by the script entry point.
TRAIN_FILE = "QAFormatResult-train-format.txt"
TEST_FILE = "QAFormatResult-test-format.txt"


def naive_bayes_classifier(train_x, train_y):
    """Fit and return a multinomial naive Bayes model (alpha=0.01)."""
    from sklearn.naive_bayes import MultinomialNB
    model = MultinomialNB(alpha=0.01)
    model.fit(train_x, train_y)
    return model


def knn_classifier(train_x, train_y):
    """Fit and return a k-nearest-neighbours model with default settings."""
    from sklearn.neighbors import KNeighborsClassifier
    model = KNeighborsClassifier()
    model.fit(train_x, train_y)
    return model


def logistic_regression_classifier(train_x, train_y):
    """Fit and return an L2-penalised logistic regression model."""
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(penalty='l2')
    model.fit(train_x, train_y)
    return model


def random_forest_classifier(train_x, train_y):
    """Fit and return a random forest with 8 trees."""
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=8)
    model.fit(train_x, train_y)
    return model


def decision_tree_classifier(train_x, train_y):
    """Fit and return a decision tree with default settings."""
    from sklearn import tree
    model = tree.DecisionTreeClassifier()
    model.fit(train_x, train_y)
    return model


def gradient_boosting_classifier(train_x, train_y):
    """Fit and return a gradient boosting model with 200 estimators."""
    from sklearn.ensemble import GradientBoostingClassifier
    model = GradientBoostingClassifier(n_estimators=200)
    model.fit(train_x, train_y)
    return model


def svm_classifier(train_x, train_y):
    """Fit and return an RBF-kernel SVM with probability estimates enabled."""
    from sklearn.svm import SVC
    model = SVC(kernel='rbf', probability=True)
    model.fit(train_x, train_y)
    return model


def svm_cross_validation(train_x, train_y):
    """Grid-search C and gamma for an RBF SVM, then fit with the best pair.

    Prints every parameter of the best estimator found by the search and
    returns a fresh SVC fitted on the full training data with the selected
    ``C`` and ``gamma``.
    """
    try:
        from sklearn.model_selection import GridSearchCV
    except ImportError:
        # Older scikit-learn releases only ship the legacy module.
        from sklearn.grid_search import GridSearchCV
    from sklearn.svm import SVC
    model = SVC(kernel='rbf', probability=True)
    param_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000],
                  'gamma': [0.001, 0.0001]}
    grid_search = GridSearchCV(model, param_grid, n_jobs=1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    for para, val in best_parameters.items():
        print('%s %s' % (para, val))
    model = SVC(kernel='rbf', C=best_parameters['C'],
                gamma=best_parameters['gamma'], probability=True)
    model.fit(train_x, train_y)
    return model


def read_data_mnist(data_file):
    """Load a gzipped pickle holding ``(train, val, test)`` splits.

    Each split is indexed as ``split[0]`` (features) and ``split[1]``
    (labels); the validation split is ignored.

    Returns:
        ``(train_x, train_y, test_x, test_y)``.
    """
    import gzip
    # SECURITY: unpickling executes arbitrary code; only load trusted files.
    with gzip.open(data_file, "rb") as f:
        if sys.version_info[0] < 3:
            train, _val, test = pickle.load(f)
        else:
            # Pickles written by Python 2 need an explicit byte-string codec.
            train, _val, test = pickle.load(f, encoding='latin1')
    return train[0], train[1], test[0], test[1]


def read_data_conversation(data_file):
    """Read a whitespace-separated numeric file into features and labels.

    Every non-blank line is parsed as floats; the first value is the label
    and the remaining values are the feature vector.  Blank lines and
    repeated or trailing separators are tolerated.

    Returns:
        ``(data_x, data_y)`` as numpy arrays.

    Raises:
        ValueError: if a field cannot be parsed as a float.
    """
    data_x = []
    data_y = []
    with open(data_file) as f:
        for line in f:
            # split() without an argument never yields empty fields, unlike
            # split(" ") which breaks on doubled or trailing spaces.
            fields = line.split()
            if not fields:
                continue
            values = [float(x) for x in fields]
            data_x.append(values[1:])
            data_y.append(values[0])
    return np.array(data_x), np.array(data_y)


def read_data(train_file, test_file):
    """Read the training and the testing file with read_data_conversation.

    Returns:
        ``(train_x, train_y, test_x, test_y)``.
    """
    train_x, train_y = read_data_conversation(train_file)
    test_x, test_y = read_data_conversation(test_file)
    return train_x, train_y, test_x, test_y


def evaluate(is_binary_class, predict, predict_pos, test_y):
    """Print evaluation metrics for one set of predictions.

    Args:
        is_binary_class: whether the task has exactly two classes; precision,
            recall and ROC AUC are only reported in that case.
        predict: hard label predictions for the test set.
        predict_pos: scores passed to ``roc_auc_score``.
        test_y: true test labels.
    """
    # Imported lazily like every other scikit-learn dependency in this file.
    from sklearn import metrics
    if is_binary_class:
        precision = metrics.precision_score(test_y, predict)
        recall = metrics.recall_score(test_y, predict)
        print('precision: %.3f%%\nrecall: %.3f%%'
              % (100 * precision, 100 * recall))
    accuracy = metrics.accuracy_score(test_y, predict)
    print('accuracy: %.3f%%' % (100 * accuracy))
    if is_binary_class:
        # roc_auc_score rejects multiclass labels paired with 1-D scores.
        roc_auc = metrics.roc_auc_score(test_y, predict_pos)
        print('roc_auc: %.3f' % roc_auc)


def _positive_scores(model, data_x):
    """Return column 1 of ``model.predict_proba(data_x)``."""
    return model.predict_proba(data_x)[:, 1]


def _hard_labels(scores):
    """Turn scores into 0/1 labels by truncating ``score + 0.5``."""
    return (np.asarray(scores) + 0.5).astype(int)


def _majority_vote(vote_counts, num_voters):
    """Return 1 where strictly more than half of the voters said 1, else 0."""
    return (2 * np.asarray(vote_counts) > num_voters).astype(int)


def main():
    """Train the selected classifiers and report individual/ensemble scores."""
    model_save_file = None
    model_save = {}

    test_classifiers = ['NB',
                        # 'KNN',
                        'LR',
                        'RF',
                        'DT',
                        'SVM',
                        'GBDT',
                        ]
    classifiers = {'NB': naive_bayes_classifier,
                   'KNN': knn_classifier,
                   'LR': logistic_regression_classifier,
                   'RF': random_forest_classifier,
                   'DT': decision_tree_classifier,
                   'SVM': svm_classifier,
                   'SVMCV': svm_cross_validation,
                   'GBDT': gradient_boosting_classifier,
                   }
    if not test_classifiers:
        raise ValueError('test_classifiers must name at least one classifier')

    print('reading training and testing data...')
    train_x, train_y, test_x, test_y = read_data(TRAIN_FILE, TEST_FILE)
    num_train = train_x.shape[0]
    num_test, num_feat = test_x.shape
    is_binary_class = (len(np.unique(train_y)) == 2)
    print('******************** Data Info *********************')
    print('#training data: %d, #testing_data: %d, dimension: %d'
          % (num_train, num_test, num_feat))

    print('testing train data... ')
    print(train_x[0])
    print(train_y[0])
    print('testing test data... ')
    print(test_x[0])
    print(test_y[0])

    # One entry per classifier; stacked column-wise after the loop so the
    # result is 2-D even when only a single classifier is selected.
    train_columns = []
    test_columns = []
    vote_columns = []

    for classifier in test_classifiers:
        print('******************* %s ********************' % classifier)
        start_time = time.time()
        model = classifiers[classifier](train_x, train_y)
        print('training took %fs!' % (time.time() - start_time))

        predict_pos = _positive_scores(model, test_x)
        predict = _hard_labels(predict_pos)
        vote_columns.append(predict)
        test_columns.append(predict_pos)
        # NOTE(review): these scores come from the data the model was fitted
        # on, so the stacked model below sees optimistic inputs.
        train_columns.append(_positive_scores(model, train_x))

        if model_save_file is not None:
            model_save[classifier] = model
        evaluate(is_binary_class, predict, predict_pos, test_y)

    ensemble_train_x = np.column_stack(train_columns)
    ensemble_test_x = np.column_stack(test_columns)
    print('******************* ensemble ********************')
    start_time = time.time()
    model = logistic_regression_classifier(ensemble_train_x, train_y)
    print('training took %fs!' % (time.time() - start_time))
    predict_pos = _positive_scores(model, ensemble_test_x)
    predict = _hard_labels(predict_pos)
    evaluate(is_binary_class, predict, predict_pos, test_y)

    print('******************* voting ********************')
    vote_counts = np.sum(np.column_stack(vote_columns), axis=1)
    predict = _majority_vote(vote_counts, len(test_classifiers))
    # The hard votes double as the scores for the AUC computation.
    evaluate(is_binary_class, predict, predict, test_y)

    if model_save_file is not None:
        with open(model_save_file, 'wb') as out:
            pickle.dump(model_save, out)


if __name__ == '__main__':
    main()
相关文章推荐
- Python动态类型的学习---引用的理解
- Python3写爬虫(四)多线程实现数据爬取
- 垃圾邮件过滤器 python简单实现
- 下载并遍历 names.txt 文件,输出长度最长的回文人名。
- install and upgrade scrapy
- Scrapy的架构介绍
- Centos6 编译安装Python
- 使用Python生成Excel格式的图片
- 让Python文件也可以当bat文件运行
- [Python]推算数独
- Python中zip()函数用法举例
- Python中map()函数浅析
- Python将excel导入到mysql中
- Python在CAM软件Genesis2000中的应用
- 使用Shiboken为C++和Qt库创建Python绑定
- FREEBASIC 编译可被python调用的dll函数示例
- Python 七步捉虫法