《 大数据竞赛平台——Kaggle 入门篇》的补充
2017-02-22 12:03
791 查看
《 大数据竞赛平台——Kaggle 入门篇》的补充
博客原文链接:[ http://blog.csdn.net/u012162613/article/details/41929171 ]
knn_benchmark.csv下载
GitHub 的链接:https://github.com/clytwynec/digit_recognition/blob/master/data/knn_benchmark.csv
将得到的result.csv(下面代码中读取的文件名为 sklearn_knn_Result.csv)转换为提交文件格式:
Python代码
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 21 21:35:48 2017

@author: xyz

Convert a raw prediction CSV into the two-column ``ImageId,Label``
submission format.
"""
import csv
import sys

import numpy as np


def toInt(array):
    """Convert a table of numeric strings to integer-valued numbers.

    Each element is converted with ``int(float(value))``: strings such as
    ``'3.0'`` must go through ``float`` first, and the fractional part is
    truncated.

    Args:
        array: a sequence (or sequence of rows) of numeric strings/numbers.
            One-dimensional input is treated as a single row.

    Returns:
        A float ndarray with at least two dimensions holding the truncated
        values.
    """
    rows = np.atleast_2d(np.asarray(array))
    converted = np.zeros(rows.shape)
    for index, value in np.ndenumerate(rows):
        # Read from the function's own argument; the earlier version read
        # an undefined name ``label`` here and raised NameError.
        converted[index] = int(float(value))
    return converted


def loadPredictResult(csvName='sklearn_knn_Result.csv'):
    """Load every row of the prediction CSV as integer-valued numbers.

    No header row is skipped: every non-empty row is treated as data.

    Args:
        csvName: path of the CSV file to read.

    Returns:
        The rows of the file converted by ``toInt``.
    """
    with open(csvName) as resultFile:
        # Blank lines come back from csv.reader as empty lists; drop them
        # so the rows still form a rectangular table.
        rows = [row for row in csv.reader(resultFile) if row]
    return toInt(rows)


def _openCsvForWrite(csvName):
    """Open ``csvName`` for csv.writer in the mode each Python version needs."""
    if sys.version_info[0] < 3:
        return open(csvName, 'wb')
    return open(csvName, 'w', newline='')


def saveResult(result, csvName):
    """Write ``result`` as an ``ImageId,Label`` CSV file.

    Args:
        result: iterable of rows; the first element of each row is the label.
        csvName: path of the CSV file to create.
    """
    with _openCsvForWrite(csvName) as submissionFile:
        writer = csv.writer(submissionFile)
        writer.writerow(['ImageId', 'Label'])
        # Image ids are 1-based and follow the order of the rows.
        for imageId, row in enumerate(result, 1):
            writer.writerow([imageId, int(row[0])])


if __name__ == '__main__':
    resultlabel = loadPredictResult()
    saveResult(resultlabel, 'processed-result.csv')
利用Keras构建ANN模型:
参考博客《【Python与机器学习】:利用Keras进行多类分类》
[ http://www.cnblogs.com/arkenstone/p/5943489.html?utm_source=itdadao&utm_medium=referral ]
输入特征为784维,利用one-hot encoder对0-9这10个分类标签进行编码。综上,输入神经元个数为784,输出神经元个数为10,隐藏节点数取输入节点数和输出节点数乘积的平方根:$\text{hidnodes}=\sqrt{784\times 10}\approx 88$
神经网络结构如图:
Python代码
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 16 21:59:00 2014

@author: wepon
@blog:http://blog.csdn.net/u012162613

Train a 784-88-10 fully connected network on train.csv, predict test.csv,
compare the predictions with knn_benchmark.csv and write a submission file.
"""
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

# Load the training set: column 0 is the label, the remaining columns are
# the input features.
dataframe = pd.read_csv("train.csv")
dataset = dataframe.values
X_train = dataset[:, 1:].astype(int)
Y = dataset[:, 0].astype(int)

# Load the test set: every column is an input feature.
testdataframe = pd.read_csv("test.csv")
testdataset = testdataframe.values
X_test = testdataset.astype(int)

# Load the reference labels used below to score the predictions.
validdataframe = pd.read_csv("knn_benchmark.csv")
validdataset = validdataframe.values

# Encodes class values as integers; shared so predictions can be mapped back.
encoder = LabelEncoder()


def labelencode(Y):
    """Return the one-hot (dummy variable) encoding of the labels ``Y``.

    The module-level ``encoder`` is fitted on ``Y`` as a side effect.
    """
    encoded_Y = encoder.fit_transform(Y)
    # Convert the integer classes to dummy variables (one-hot encoding).
    dummy_y = np_utils.to_categorical(encoded_Y)
    return dummy_y


def baseline_model():
    """Build and compile the 784-88-10 network.

    The hidden layer size is the square root of (input nodes * output nodes).
    """
    model = Sequential()
    model.add(Dense(output_dim=88, input_dim=784, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(output_dim=10, input_dim=88, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model


# batch_size sets the size of each batch, nb_epoch the number of epochs.
estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=40,
                            batch_size=256)

Y_train = labelencode(Y)
estimator.fit(X_train, Y_train)

# Make predictions and map the numeric classes back to the initial labels.
pred = estimator.predict(X_test)
init_lables = encoder.inverse_transform(pred)
init_lables = pd.DataFrame({'label': init_lables})  # array -> DataFrame

# Score the predictions against the benchmark labels.
validdataframe['prelabel'] = init_lables['label']
validdataframe['accuracy'] = validdataframe['prelabel'] == validdataframe['Label']
totalcount = validdataframe['accuracy'].count()
print('ANN accuarcy: %s' % (validdataframe['accuracy'].sum() / 1.0 / totalcount))

# Write the predicted labels to the submission csv. The 'Label' column of
# validdataframe holds the benchmark labels, so the predictions have to be
# taken from 'prelabel'.
submission = pd.DataFrame({'ImageId': validdataframe['ImageId'],
                           'Label': validdataframe['prelabel']})
submission.to_csv('submision.csv', columns=['ImageId', 'Label'], index=False)
[在Kaggle上的排名]
比较不同模型的分类效果,看看CNN的效果如何:
如何用卷积神经网络CNN识别手写数字集?
[ http://blog.csdn.net/zdy0_2004/article/details/51945932 ]
先给出CNN网络结构模型:
代码
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 16 21:59:00 2014

@author: wepon
@blog:http://blog.csdn.net/u012162613

Train a small convolutional network on train.csv, predict test.csv, compare
the predictions with knn_benchmark.csv and write a submission file.
"""
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout, Convolution2D, Reshape, AveragePooling2D, Flatten
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import SGD
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

# Load the training set: column 0 is the label, the remaining columns are
# the input features.
dataframe = pd.read_csv("train.csv")
dataset = dataframe.values
X_train = dataset[:, 1:].astype(int)
Y = dataset[:, 0].astype(int)

# Load the test set: every column is an input feature.
testdataframe = pd.read_csv("test.csv")
testdataset = testdataframe.values
X_test = testdataset.astype(int)

# Load the reference labels used below to score the predictions.
validdataframe = pd.read_csv("knn_benchmark.csv")
validdataset = validdataframe.values

# Encodes class values as integers; shared so predictions can be mapped back.
encoder = LabelEncoder()


def labelencode(Y):
    """Return the one-hot (dummy variable) encoding of the labels ``Y``.

    The module-level ``encoder`` is fitted on ``Y`` as a side effect.
    """
    encoded_Y = encoder.fit_transform(Y)
    # Convert the integer classes to dummy variables (one-hot encoding).
    dummy_y = np_utils.to_categorical(encoded_Y)
    return dummy_y


Y_train = labelencode(Y)


def baseline_model():
    """Build and compile the fully connected 784-88-10 network."""
    model = Sequential()
    model.add(Dense(output_dim=88, input_dim=784, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(output_dim=10, input_dim=88, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model


def cnn_model():
    """Build and compile the convolutional network.

    Two 3x3 convolution + 2x2 average-pooling stages (32 then 64 filters)
    are followed by two 1000-unit sigmoid layers and a 10-unit linear
    output, trained with SGD on a mean-squared-error loss.
    """
    model = Sequential()
    # The Reshape layer turns each flat 784-value input row into a
    # (1, 28, 28) array for the convolution layers.
    model.add(Reshape(target_shape=(1, 28, 28), input_shape=(784,)))
    model.add(Convolution2D(nb_filter=32, nb_row=3, nb_col=3,
                            dim_ordering='th', border_mode='same',
                            bias=False, init='uniform'))
    model.add(AveragePooling2D(pool_size=(2, 2), dim_ordering='th'))
    model.add(Convolution2D(nb_filter=64, nb_row=3, nb_col=3,
                            dim_ordering='th', border_mode='same',
                            bias=False, init='uniform'))
    model.add(AveragePooling2D(pool_size=(2, 2), dim_ordering='th'))
    # With dim_ordering='th' the model so far outputs 3D feature maps laid
    # out as (features, height, width); Flatten converts them to 1D vectors.
    model.add(Flatten())
    # Dense is the usual fully connected layer.
    model.add(Dense(output_dim=1000, activation='sigmoid'))
    model.add(Dense(output_dim=1000, activation='sigmoid'))
    model.add(Dense(output_dim=10, activation='linear'))
    # The learning rate must be a number; it used to be the string '0.01'.
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='mean_squared_error', optimizer=sgd,
                  metrics=['accuracy'])
    return model


model_name = 'cnn'
estimator = KerasClassifier(build_fn=cnn_model, nb_epoch=40, batch_size=256)
estimator.fit(X_train, Y_train)

# Make predictions and map the numeric classes back to the initial labels.
pred = estimator.predict(X_test)
init_lables = encoder.inverse_transform(pred)
init_lables = pd.DataFrame({'label': init_lables})

# Score the predictions against the benchmark labels.
validdataframe['prelabel'] = init_lables['label']
validdataframe['accuracy'] = validdataframe['prelabel'] == validdataframe['Label']
totalcount = validdataframe['accuracy'].count()
print('%s accuarcy: %s' % (model_name,
                           validdataframe['accuracy'].sum() / 1.0 / totalcount))

# Write the predicted labels to the submission csv. The 'Label' column of
# validdataframe holds the benchmark labels, so the predictions have to be
# taken from 'prelabel'.
submission = pd.DataFrame({'ImageId': validdataframe['ImageId'],
                           'Label': validdataframe['prelabel']})
submission.to_csv('submision_%s.csv' % model_name,
                  columns=['ImageId', 'Label'], index=False)
相关文章推荐
- 大数据竞赛平台——Kaggle 入门
- Kaggle大数据竞赛平台入门
- 大数据竞赛平台——Kaggle 入门篇
- [Kaggle] 数据建模分析与竞赛平台介绍
- 大数据竞赛平台——Kaggle 入门
- 大数据竞赛平台——Kaggle 入门篇
- 大数据竞赛平台——Kaggle 入门篇
- 大数据竞赛平台——Kaggle 入门篇
- 大数据竞赛平台——Kaggle 入门
- Kaggle大数据竞赛平台入门
- 大数据竞赛平台——Kaggle 入门
- 大数据竞赛平台——Kaggle 入门
- 大数据竞赛平台——Kaggle 入门
- 大数据竞赛平台——Kaggle 入门篇
- 大数据竞赛平台——Kaggle 入门篇
- 大数据竞赛平台——Kaggle 入门
- 大数据竞赛平台——Kaggle 入门篇
- 大数据竞赛平台——Kaggle 入门篇
- 大数据竞赛平台——Kaggle 入门
- [Kaggle] 数据建模分析与竞赛平台介绍