您的位置:首页 > 大数据

《 大数据竞赛平台——Kaggle 入门篇》的补充

2017-02-22 12:03 791 查看

《 大数据竞赛平台——Kaggle 入门篇》的补充

博客原文链接:

[ http://blog.csdn.net/u012162613/article/details/41929171 ]

knn_benchmark.csv下载

GitHub 的链接

https://github.com/clytwynec/digit_recognition/blob/master/data/knn_benchmark.csv

将得到result.csv转换为提交文件格式:

Python代码

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 21 21:35:48 2017

@author: xyz
"""
from numpy import *
import csv

def toInt(array):
    """Convert a table of numeric strings into a 2-D array of whole numbers.

    Args:
        array: Anything ``asmatrix`` accepts (nested lists, ndarray, ...)
            whose elements can be parsed by ``float``. One-dimensional input
            is treated as a single row.

    Returns:
        A new ``(m, n)`` ndarray created with ``zeros`` (so its dtype is
        float) in which every element is the input value truncated toward
        zero.

    Raises:
        ValueError: If an element cannot be parsed as a number.
    """
    # asmatrix (the non-removed alias of mat) guarantees a 2-D shape.
    array = asmatrix(array)
    m, n = shape(array)
    newArray = zeros((m, n))
    for i in range(m):
        for j in range(n):
            # Strings such as '3.0' must go through float before int.
            # Read from the argument itself; the original indexed an
            # unrelated name (`label`) that is not defined in this function.
            newArray[i, j] = int(float(array[i, j]))
    return newArray

def loadPredictResult():
    """Read 'sklearn_knn_Result.csv' and return its cells as whole numbers.

    Every row of the file is kept, including the first one.
    NOTE(review): if the file starts with a textual header row, toInt will
    fail on it -- confirm the file layout.

    Returns:
        The array produced by ``toInt`` for the rows of the CSV file.
    """
    with open('sklearn_knn_Result.csv') as csv_file:
        rows = [row for row in csv.reader(csv_file)]
    return toInt(array(rows))

def saveResult(result, csvName):
    """Write predictions to ``csvName`` as an 'ImageId,Label' CSV file.

    Args:
        result: Iterable of rows; the first element of each row is written
            as the label after conversion to ``int``.
        csvName: Path of the CSV file to create (overwritten if it exists).

    Image ids are assigned sequentially starting at 1, in iteration order.
    """
    # 'wb' is the mode the Python 2 csv module expects for output files.
    with open(csvName, 'wb') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(['ImageId', 'Label'])
        for image_id, row in enumerate(result, 1):
            writer.writerow([image_id, int(row[0])])

# Script entry point: load the raw predictions, then rewrite them in the
# 'ImageId,Label' submission layout as 'processed-result.csv'.
resultlabel=loadPredictResult()
saveResult(resultlabel,'processed-result.csv')


利用Keras构建ANN模型:

参考博客《【Python与机器学习】:利用Keras进行多类分类》

[ http://www.cnblogs.com/arkenstone/p/5943489.html?utm_source=itdadao&utm_medium=referral ]

输入特征为784,利用one-hot encoder对0-9这10个分类标签编码。综上,输入神经元个数为784,输出神经元个数为10,隐藏节点数取输入节点数和输出节点数乘积的平方根:$\text{hidnodes}=\sqrt{784\times 10}\approx 88$

神经网络结构如图:



Python代码

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 16 21:59:00 2014

@author: wepon

@blog:http://blog.csdn.net/u012162613
"""

import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
#from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder

# Load the training set: column 0 is taken as the label vector Y and the
# remaining columns as the feature matrix X_train, both cast to int.
# NOTE(review): presumably the Kaggle digit-recognizer layout (label + 784
# pixel columns) -- confirm against the actual file.
dataframe = pd.read_csv("train.csv")
dataset = dataframe.values
X_train= dataset[:, 1:].astype(int)
Y = dataset[:, 0].astype(int)

# Load the test set: every column is used as a feature (no label column is
# split off).
testdataframe = pd.read_csv("test.csv")
testdataset = testdataframe.values
X_test= testdataset.astype(int)

# Load the reference file whose 'Label' column the predictions are compared
# against further down in this script.
validdataframe = pd.read_csv("knn_benchmark.csv")
validdataset = validdataframe.values
# Shared encoder: fitted inside labelencode() and reused later to map
# predicted class indices back to the original label values.
encoder = LabelEncoder()


def labelencode(Y):
    """Fit the module-level ``encoder`` on ``Y`` and one-hot encode it.

    Args:
        Y: Sequence of class labels.

    Returns:
        The matrix returned by ``np_utils.to_categorical`` for the
        integer-encoded labels (one row per sample).
    """
    class_indices = encoder.fit_transform(Y)
    return np_utils.to_categorical(class_indices)

# Model structure: the hidden width is the square root of
# (input nodes * output nodes), i.e. sqrt(784 * 10) ~= 88.
def baseline_model():
    """Build and compile the 784-88-10 fully connected classifier.

    Returns:
        A compiled ``Sequential`` model: a ReLU hidden layer of 88 units,
        dropout of 0.2, and a 10-way softmax output, trained with
        categorical cross-entropy and the Adam optimizer.
    """
    layers = [
        Dense(output_dim=88, input_dim=784, activation='relu'),
        Dropout(0.2),
        Dense(output_dim=10, input_dim=88, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network
# Wrap the Keras model in the scikit-learn style classifier interface.
# batch_size is the mini-batch size, nb_epoch the number of training epochs.
estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=40, batch_size=256)

# Train on the full training set with one-hot encoded labels.
Y_train = labelencode(Y)
estimator.fit(X_train, Y_train)

# Predict class indices for the test set.
pred = estimator.predict(X_test)

# Map the predicted class indices back to the original label values.
init_lables = encoder.inverse_transform(pred)
init_lables = pd.DataFrame({'label': init_lables})  # ndarray -> DataFrame

# Compare with the KNN benchmark labels. Note that this measures agreement
# with the benchmark file, not accuracy against ground truth.
validdataframe['prelabel'] = init_lables['label']
validdataframe['accuracy'] = validdataframe['prelabel'] == validdataframe['Label']
totalcount = validdataframe['accuracy'].count()
# Single-argument print(...) yields the same output on Python 2 and 3.
print('ANN accuarcy: %s' % (validdataframe['accuracy'].sum() / 1.0 / totalcount))

# Write the model's predictions to the submission file. The original wrote
# the benchmark's own 'Label' column, so the submission never contained the
# network's output.
submission = validdataframe[['ImageId', 'prelabel']].rename(columns={'prelabel': 'Label'})
submission.to_csv('submision.csv', index=False)


[在Kaggle上的排名]



比较不同模型的分类效果,看看CNN的效果如何:

如何用卷积神经网络CNN识别手写数字集?

http://blog.csdn.net/zdy0_2004/article/details/51945932

先给出CNN网络结构模型



代码

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 16 21:59:00 2014

@author: wepon

@blog:http://blog.csdn.net/u012162613
"""

import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout,Convolution2D,Reshape,AveragePooling2D, Flatten
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import SGD
from keras.utils import np_utils
#from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder

# Load the training set (the file's first row is read as the header):
# column 0 is taken as the label vector Y and the remaining columns as the
# feature matrix X_train, both cast to int.
# NOTE(review): presumably the Kaggle digit-recognizer layout (label + 784
# pixel columns) -- confirm against the actual file.
dataframe = pd.read_csv("train.csv")
dataset = dataframe.values
X_train= dataset[:, 1:].astype(int)
Y = dataset[:, 0].astype(int)

# Load the test set (header row included in the file): every column is used
# as a feature, no label column is split off.
testdataframe = pd.read_csv("test.csv")
testdataset = testdataframe.values
X_test= testdataset.astype(int)

# Load the reference file whose 'Label' column the predictions are compared
# against further down in this script.
validdataframe = pd.read_csv("knn_benchmark.csv")
validdataset = validdataframe.values

# Shared encoder: fitted inside labelencode() and reused later to map
# predicted class indices back to the original label values.
encoder = LabelEncoder()


def labelencode(Y):
    """Fit the module-level ``encoder`` on ``Y`` and one-hot encode it.

    Args:
        Y: Sequence of class labels.

    Returns:
        The matrix returned by ``np_utils.to_categorical`` for the
        integer-encoded labels (one row per sample).
    """
    class_indices = encoder.fit_transform(Y)
    return np_utils.to_categorical(class_indices)


# One-hot training targets used by the estimator below.
Y_train = labelencode(Y)
# define model structure 784-88-10
def baseline_model():
    """Build and compile the 784-88-10 fully connected classifier.

    The hidden width follows the rule used in the accompanying text:
    sqrt(input nodes * output nodes) = sqrt(784 * 10) ~= 88. The original
    code used 85 here, which contradicted the documented 784-88-10 layout.

    Returns:
        A compiled ``Sequential`` model: a ReLU hidden layer of 88 units,
        dropout of 0.2, and a 10-way softmax output, trained with
        categorical cross-entropy and the Adam optimizer.
    """
    model = Sequential()
    model.add(Dense(output_dim=88, input_dim=784, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(output_dim=10, input_dim=88, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# define cnn model
def cnn_model():
    """Build and compile the convolutional digit classifier.

    Layers, in order: reshape of the flat 784-value input to (1, 28, 28),
    two blocks of 3x3 'same' convolution (32 then 64 filters, no bias,
    uniform init) each followed by 2x2 average pooling, a flatten step, two
    sigmoid dense layers of 1000 units and a linear 10-unit output.

    Returns:
        A compiled ``Sequential`` model using mean squared error loss and
        SGD with Nesterov momentum.
    """
    model = Sequential()
    # Reshape turns the flat input vector into a channels-first image.
    model.add(Reshape(target_shape=(1, 28, 28), input_shape=(784,)))
    model.add(Convolution2D(nb_filter=32, nb_row=3, nb_col=3, dim_ordering='th',
                            border_mode='same', bias=False, init='uniform'))
    model.add(AveragePooling2D(pool_size=(2, 2), dim_ordering='th'))
    model.add(Convolution2D(nb_filter=64, nb_row=3, nb_col=3, dim_ordering='th',
                            border_mode='same', bias=False, init='uniform'))
    model.add(AveragePooling2D(pool_size=(2, 2), dim_ordering='th'))
    # With dim_ordering='th' the feature maps are (features, height, width);
    # Flatten converts them to 1-D feature vectors for the dense layers.
    model.add(Flatten())
    # Dense is the ordinary fully connected layer.
    model.add(Dense(output_dim=1000, activation='sigmoid'))
    model.add(Dense(output_dim=1000, activation='sigmoid'))
    model.add(Dense(output_dim=10, activation='linear'))
    # The learning rate must be a number; the original passed the string '0.01'.
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='mean_squared_error', optimizer=sgd, metrics=['accuracy'])
    return model
# Name used in the log line and in the output file name.
model_name = 'cnn'
# Wrap the Keras model in the scikit-learn style classifier interface.
estimator = KerasClassifier(build_fn=cnn_model, nb_epoch=40, batch_size=256)

# Train on the full training set with the one-hot encoded labels.
estimator.fit(X_train, Y_train)

# Predict class indices for the test set.
pred = estimator.predict(X_test)

# Map the predicted class indices back to the original label values.
init_lables = encoder.inverse_transform(pred)
init_lables = pd.DataFrame({'label': init_lables})

# Compare with the KNN benchmark labels. Note that this measures agreement
# with the benchmark file, not accuracy against ground truth.
validdataframe['prelabel'] = init_lables['label']
validdataframe['accuracy'] = validdataframe['prelabel'] == validdataframe['Label']
totalcount = validdataframe['accuracy'].count()
# Single-argument print(...) yields the same output on Python 2 and 3.
print('%s accuarcy: %s' % (model_name, validdataframe['accuracy'].sum() / 1.0 / totalcount))

# Write the model's predictions to the submission file. The original wrote
# the benchmark's own 'Label' column, so the submission never contained the
# network's output.
submission = validdataframe[['ImageId', 'prelabel']].rename(columns={'prelabel': 'Label'})
submission.to_csv('submision_%s.csv' % model_name, index=False)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  Kaggle Digit KNN