您的位置:首页 > 理论基础 > 计算机网络

Python分模块实现神经网络之CIFAR分类:1.加载CIFAR数据集中的数据

2017-12-30 23:08 549 查看
加载CIFAR数据集中的数据是在data_utils.py文件中完成的,下面是加载文件的代码:

import numpy as np
import os
#import cPickle as pickle
import pickle as pk

def load_CIFAR_batch(filename):
    """Load a single pickled CIFAR batch file.

    Args:
        filename: Path to the batch file (for example ``data_batch_1``).

    Returns:
        A tuple ``(X, Y)``. ``X`` is a float array of shape
        ``(N, 32, 32, 3)`` built from the batch's ``'data'`` entry, and
        ``Y`` is an array built from its ``'labels'`` entry.
    """
    # NOTE: unpickling can execute arbitrary code; only load batch files
    # that come from a trusted source.
    with open(filename, 'rb') as f:
        # encoding='latin1' is the setting that lets Python 3 read pickles
        # written by Python 2 (presumably how these batches were produced).
        datadict = pk.load(f, encoding='latin1')
    X = datadict['data']
    Y = datadict['labels']
    # Each row holds one flattened image stored as (channel, height, width).
    # Use -1 so the image count is inferred instead of assuming 10000 rows,
    # then move the channel axis last: (N, 3, 32, 32) -> (N, 32, 32, 3).
    X = np.asarray(X).reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
    Y = np.array(Y)
    return X, Y

def load_CIFAR10(ROOT, num_batches=1):
    """Load CIFAR-10 training batches and the test batch from ``ROOT``.

    Args:
        ROOT: Directory containing ``data_batch_<i>`` files and
            ``test_batch``.
        num_batches: Number of training batches to read, starting at
            ``data_batch_1``. The default of 1 reproduces the original
            ``range(1, 2)`` loop, which read only the first batch.

    Returns:
        A tuple ``(Xtr, Ytr, Xte, Yte)``: the concatenated training data
        and labels, followed by the test data and labels.

    Raises:
        ValueError: If ``num_batches`` is smaller than 1.
    """
    if num_batches < 1:
        raise ValueError("num_batches must be at least 1")
    xs = []
    ys = []
    for b in range(1, num_batches + 1):
        f = os.path.join(ROOT, 'data_batch_%d' % (b, ))
        X, Y = load_CIFAR_batch(f)
        xs.append(X)
        ys.append(Y)
    # Join the per-batch arrays along the first (sample) axis.
    Xtr = np.concatenate(xs)
    Ytr = np.concatenate(ys)
    Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, 'test_batch'))
    return Xtr, Ytr, Xte, Yte

def get_CIFAR10_data(num_training=5000, num_validation=500, num_test=500,
                     cifar10_dir='D://cifar-10-python//cifar-10-batches-py//'):
    """Load CIFAR-10 from disk and split/preprocess it for training.

    The loaded training data is split into a training subset (the first
    ``num_training`` samples) and a validation subset (the next
    ``num_validation`` samples); the test data is cut to its first
    ``num_test`` samples. The mean image of the training subset is then
    subtracted from all three sets, and the channel axis is moved in front
    of the spatial axes.

    Progress information (index ranges and array shapes) is printed along
    the way.

    Args:
        num_training: Number of samples kept for training.
        num_validation: Number of samples kept for validation.
        num_test: Number of samples kept for testing.
        cifar10_dir: Directory that holds the CIFAR-10 batch files. The
            default is the path that was previously hard-coded.

    Returns:
        A dict with keys ``'X_train'``, ``'y_train'``, ``'X_val'``,
        ``'y_val'``, ``'X_test'`` and ``'y_test'``. The ``X_*`` arrays are
        laid out as ``(N, 3, 32, 32)``.
    """
    # Load the raw CIFAR-10 data
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    # Validation split: taken before X_train is rebound to the training
    # subset, so it indexes into the full loaded array.
    mask = range(num_training, num_training + num_validation)
    print(mask)
    print("X_train")
    print(X_train.shape)
    X_val = X_train[mask]
    print("X_val")
    print(X_val.shape)
    y_val = y_train[mask]

    # Training split: the first num_training samples.
    mask = range(num_training)
    print(mask)
    print("X_train:num_training1")
    print(X_train.shape)
    X_train = X_train[mask]
    print("X_train:num_training2")
    print(X_train.shape)
    y_train = y_train[mask]

    # Test split: the first num_test samples.
    mask = range(num_test)
    X_test = X_test[mask]
    y_test = y_test[mask]

    # Normalize the data: subtract the mean image of the training subset
    # from every split.
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image

    # Transpose so that channels come first: (N, 32, 32, 3) -> (N, 3, 32, 32)
    X_train = X_train.transpose(0, 3, 1, 2).copy()
    X_val = X_val.transpose(0, 3, 1, 2).copy()
    X_test = X_test.transpose(0, 3, 1, 2).copy()

    # Package data into a dictionary
    return {
        'X_train': X_train, 'y_train': y_train,
        'X_val': X_val, 'y_val': y_val,
        'X_test': X_test, 'y_test': y_test,
    }


其中,

def load_CIFAR10(ROOT):
""" Load the training batch(es) selected by the loop below, plus test_batch, from ROOT. """
xs = []
ys = []
# range(1, 2) yields only b == 1, so only data_batch_1 is read.
for b in range(1,2):
f = os.path.join(ROOT, 'data_batch_%d' % (b, ))
X, Y = load_CIFAR_batch(f)
# Collect the per-batch arrays; they are joined after the loop.
xs.append(X)
ys.append(Y)
# Join the collected arrays along the first axis (np.concatenate's default).
Xtr = np.concatenate(xs)
Ytr = np.concatenate(ys)
del X, Y
# The test set lives in a single file named 'test_batch'.
Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, 'test_batch'))
return Xtr, Ytr, Xte, Yte

在load_CIFAR10()函数中,append()方法用于向列表(list)的末尾添加一个元素(这里每次添加的是一个批次的数组);numpy库中concatenate()函数的作用是沿着一个坐标轴(默认为第0轴)连接多个数组。



# Validation split: the num_validation samples that follow the first
# num_training ones, taken from the full loaded training array.
mask = range(num_training, num_training + num_validation)
print(mask)
print("X_train")
print(X_train.shape)
X_val = X_train[mask]
print("X_val")
print(X_val.shape)
y_val = y_train[mask]
# Training split: the first num_training samples. X_train is rebound to the
# subset, which is why its shape is printed before and after.
mask = range(num_training)
print(mask)
print("X_train:num_training1")
print(X_train.shape)
X_train = X_train[mask]
print("X_train:num_training2")
print(X_train.shape)
y_train = y_train[mask]
# Test split: the first num_test samples of the test set.
mask = range(num_test)
X_test = X_test[mask]
y_test = y_test[mask]

主要完成数据切分的工作,打印结果为:

range(5000, 5500)
X_train
(10000, 32, 32, 3)
X_val
(500, 32, 32, 3)
range(0, 5000)
X_train:num_training1
(10000, 32, 32, 3)
X_train:num_training2
(5000, 32, 32, 3)


因为训练模型时,输入的数据格式为(batch_size, channel, height, width),而从X_train的shape (10000, 32, 32, 3) 中可以看到:10000是数据条数,两个32分别是高度和宽度,3是channel,即颜色通道。所以在训练模型之前需要进行如下操作,把通道维移到第二个位置:

# Transpose so that channels come first: axis order (0, 1, 2, 3) becomes
# (0, 3, 1, 2), i.e. (N, 32, 32, 3) -> (N, 3, 32, 32).
# .copy() materializes each transposed result as a new array.
X_train = X_train.transpose(0, 3, 1, 2).copy()
X_val = X_val.transpose(0, 3, 1, 2).copy()
X_test = X_test.transpose(0, 3, 1, 2).copy()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息