您的位置：首页 > 理论基础 > 计算机网络

keras：一台设备上同时使用多张显卡训练同一个网络模型

2018-03-02 13:52 447 查看

Reference：

【简述-zzw】Keras同时用多张显卡训练网络

【知乎】如何让keras训练深度网络时使用两张显卡？

以 tensorflow 为后端，有两种方法可以在多张GPU上运行一个模型：数据并行和设备并行，参考keras中文文档。

数据并行：

数据并行将目标模型在多个设备上各复制一份，并使用每个设备上的复制品处理整个数据集的不同部分数据。Keras在keras.utils.multi_gpu_model中提供有内置函数，该函数可以产生任意模型的数据并行版本，最高支持在8片GPU上并行。请参考utils中的multi_gpu_model文档。下面是一个例子：

from keras.utils import multi_gpu_model

# Build a data-parallel copy of `model` that spans 8 GPUs.
# The machine is assumed to expose 8 usable GPUs.
parallel_model = multi_gpu_model(model, gpus=8)
parallel_model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
)

# `fit` spreads every batch across the 8 GPUs:
# with a batch size of 256, each GPU handles 32 samples.
parallel_model.fit(x, y, batch_size=256, epochs=20)

设备并行：

设备并行是在不同设备上运行同一个模型的不同部分，当模型含有多个并行结构，例如含有两个分支时，这种方式很适合。这种并行方法可以通过使用TensorFlow device scopes实现，下面是一个例子：

# Model where a shared LSTM is used to encode two different sequences in parallel
input_a = keras.Input(shape=(140, 256))
input_b = keras.Input(shape=(140, 256))

shared_lstm = keras.layers.LSTM(64)

# Process the first sequence on one GPU.
# `tf.device` is the TensorFlow device-placement context manager.
with tf.device('/gpu:0'):
    encoded_a = shared_lstm(input_a)
# Process the next sequence on another GPU
with tf.device('/gpu:1'):
    encoded_b = shared_lstm(input_b)

# Concatenate results on CPU
with tf.device('/cpu:0'):
    merged_vector = keras.layers.concatenate([encoded_a, encoded_b],
                                             axis=-1)

以keras框架使用两张GPU训练 inception_v4 模型为例：

# -*- coding: utf-8 -*-
import numpy as np

from keras.models import Sequential
from keras.layers import Input, Dense, Convolution2D, MaxPooling2D, AveragePooling2D, ZeroPadding2D, Dropout, Flatten, merge, Reshape, Activation
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras import backend as K
from sklearn.metrics import log_loss
# from load_cifar10 import load_cifar10_data

from keras.preprocessing.image import ImageDataGenerator

from keras import optimizers
import keras
import tensorflow as tf

from keras.utils import multi_gpu_model

def conv2d_bn(x, nb_filter, nb_row, nb_col,
              border_mode='same', subsample=(1, 1), bias=False):
    """Apply convolution, then batch normalization, then a ReLU activation.

    (Slightly modified from
    https://github.com/fchollet/keras/blob/master/keras/applications/inception_v3.py)

    Parameters:
        x - input tensor
        nb_filter, nb_row, nb_col - positional arguments forwarded to
            Convolution2D (filter count and kernel rows / columns)
        border_mode, subsample, bias - keyword arguments forwarded unchanged
            to Convolution2D

    Returns the tensor produced by the final ReLU activation.
    """
    # Batch normalization runs over the channel axis: axis 1 for the
    # Theano ("th") ordering, the last axis for any other ordering.
    channel_axis = 1 if K.image_dim_ordering() == "th" else -1

    conv_layer = Convolution2D(nb_filter, nb_row, nb_col,
                               subsample=subsample,
                               border_mode=border_mode,
                               bias=bias)
    out = conv_layer(x)
    out = BatchNormalization(axis=channel_axis)(out)
    return Activation('relu')(out)

def block_inception_a(input):
    """Build one Inception-A block on top of `input`.

    Four branches are computed from the same input and concatenated along
    the channel axis. Layers are created in the same order as before, since
    the surrounding script loads pretrained weights by layer name.
    """
    channel_axis = 1 if K.image_dim_ordering() == "th" else -1

    # Branch 0: a single 1x1 convolution.
    tower_0 = conv2d_bn(input, 96, 1, 1)

    # Branch 1: 1x1 convolution followed by one 3x3 convolution.
    tower_1 = input
    for filters, rows, cols in ((64, 1, 1), (96, 3, 3)):
        tower_1 = conv2d_bn(tower_1, filters, rows, cols)

    # Branch 2: 1x1 convolution followed by two stacked 3x3 convolutions.
    tower_2 = input
    for filters, rows, cols in ((64, 1, 1), (96, 3, 3), (96, 3, 3)):
        tower_2 = conv2d_bn(tower_2, filters, rows, cols)

    # Branch 3: 3x3 average pooling with stride 1, then a 1x1 convolution.
    pooled = AveragePooling2D((3, 3), strides=(1, 1), border_mode='same')(input)
    tower_3 = conv2d_bn(pooled, 96, 1, 1)

    return merge([tower_0, tower_1, tower_2, tower_3],
                 mode='concat', concat_axis=channel_axis)

def block_reduction_a(input):
    """Build the Reduction-A block on top of `input`.

    Three branches, each ending in a stride-2 'valid' operation, are
    concatenated along the channel axis.
    """
    channel_axis = 1 if K.image_dim_ordering() == "th" else -1

    # Branch 0: one strided 3x3 convolution.
    strided_conv = conv2d_bn(input, 384, 3, 3,
                             subsample=(2, 2), border_mode='valid')

    # Branch 1: 1x1 -> 3x3 -> strided 3x3 convolutions.
    conv_stack = conv2d_bn(input, 192, 1, 1)
    conv_stack = conv2d_bn(conv_stack, 224, 3, 3)
    conv_stack = conv2d_bn(conv_stack, 256, 3, 3,
                           subsample=(2, 2), border_mode='valid')

    # Branch 2: strided 3x3 max pooling.
    pooled = MaxPooling2D((3, 3), strides=(2, 2), border_mode='valid')(input)

    return merge([strided_conv, conv_stack, pooled],
                 mode='concat', concat_axis=channel_axis)

def block_inception_b(input):
    """Build one Inception-B block on top of `input`.

    Four branches (using factorized 1x7 / 7x1 convolutions) are computed
    from the same input and concatenated along the channel axis. Layers are
    created in the same order as before, since the surrounding script loads
    pretrained weights by layer name.
    """
    channel_axis = 1 if K.image_dim_ordering() == "th" else -1

    # Branch 0: a single 1x1 convolution.
    tower_0 = conv2d_bn(input, 384, 1, 1)

    # Branch 1: 1x1 -> 1x7 -> 7x1.
    tower_1 = input
    for filters, rows, cols in ((192, 1, 1), (224, 1, 7), (256, 7, 1)):
        tower_1 = conv2d_bn(tower_1, filters, rows, cols)

    # Branch 2: 1x1 -> 7x1 -> 1x7 -> 7x1 -> 1x7.
    tower_2 = input
    for filters, rows, cols in ((192, 1, 1), (192, 7, 1), (224, 1, 7),
                                (224, 7, 1), (256, 1, 7)):
        tower_2 = conv2d_bn(tower_2, filters, rows, cols)

    # Branch 3: 3x3 average pooling with stride 1, then a 1x1 convolution.
    pooled = AveragePooling2D((3, 3), strides=(1, 1), border_mode='same')(input)
    tower_3 = conv2d_bn(pooled, 128, 1, 1)

    return merge([tower_0, tower_1, tower_2, tower_3],
                 mode='concat', concat_axis=channel_axis)

def block_reduction_b(input):
    """Build the Reduction-B block on top of `input`.

    Three branches, each ending in a stride-2 'valid' operation, are
    concatenated along the channel axis.
    """
    channel_axis = 1 if K.image_dim_ordering() == "th" else -1

    # Branch 0: 1x1 convolution, then a strided 3x3 convolution.
    short_stack = conv2d_bn(input, 192, 1, 1)
    short_stack = conv2d_bn(short_stack, 192, 3, 3,
                            subsample=(2, 2), border_mode='valid')

    # Branch 1: 1x1 -> 1x7 -> 7x1, then a strided 3x3 convolution.
    long_stack = input
    for filters, rows, cols in ((256, 1, 1), (256, 1, 7), (320, 7, 1)):
        long_stack = conv2d_bn(long_stack, filters, rows, cols)
    long_stack = conv2d_bn(long_stack, 320, 3, 3,
                           subsample=(2, 2), border_mode='valid')

    # Branch 2: strided 3x3 max pooling.
    pooled = MaxPooling2D((3, 3), strides=(2, 2), border_mode='valid')(input)

    return merge([short_stack, long_stack, pooled],
                 mode='concat', concat_axis=channel_axis)

def block_inception_c(input):
    """Build one Inception-C block on top of `input`.

    Four branches are concatenated along the channel axis; branches 1 and 2
    each fork into a 1x3 and a 3x1 convolution whose outputs are themselves
    concatenated first. Layers are created in the same order as before,
    since the surrounding script loads pretrained weights by layer name.
    """
    channel_axis = 1 if K.image_dim_ordering() == "th" else -1

    def _concat(tensors):
        # Channel-wise concatenation shared by the forks and the final merge.
        return merge(tensors, mode='concat', concat_axis=channel_axis)

    # Branch 0: a single 1x1 convolution.
    tower_0 = conv2d_bn(input, 256, 1, 1)

    # Branch 1: 1x1 convolution, then a 1x3 / 3x1 fork.
    stem_1 = conv2d_bn(input, 384, 1, 1)
    fork_1_a = conv2d_bn(stem_1, 256, 1, 3)
    fork_1_b = conv2d_bn(stem_1, 256, 3, 1)
    tower_1 = _concat([fork_1_a, fork_1_b])

    # Branch 2: 1x1 -> 3x1 -> 1x3, then a 1x3 / 3x1 fork.
    stem_2 = input
    for filters, rows, cols in ((384, 1, 1), (448, 3, 1), (512, 1, 3)):
        stem_2 = conv2d_bn(stem_2, filters, rows, cols)
    fork_2_a = conv2d_bn(stem_2, 256, 1, 3)
    fork_2_b = conv2d_bn(stem_2, 256, 3, 1)
    tower_2 = _concat([fork_2_a, fork_2_b])

    # Branch 3: 3x3 average pooling with stride 1, then a 1x1 convolution.
    pooled = AveragePooling2D((3, 3), strides=(1, 1), border_mode='same')(input)
    tower_3 = conv2d_bn(pooled, 256, 1, 1)

    return _concat([tower_0, tower_1, tower_2, tower_3])

def inception_v4_base(input):
    """Build the convolutional trunk of Inception V4 on top of `input`.

    The trunk is a stem (plain convolutions plus three two-branch merges)
    followed by 4 Inception-A blocks, a Reduction-A block, 7 Inception-B
    blocks, a Reduction-B block and 3 Inception-C blocks.

    Parameters:
        input - image tensor; the shape comments below assume 299 x 299 x 3
            (tf ordering) or 3 x 299 x 299 (th ordering)

    Returns the output tensor of the last Inception-C block.
    """
    if K.image_dim_ordering() == "th":
        channel_axis = 1
    else:
        channel_axis = -1

    # Input Shape is 299 x 299 x 3 (tf) or 3 x 299 x 299 (th)
    net = conv2d_bn(input, 32, 3, 3, subsample=(2, 2), border_mode='valid')
    net = conv2d_bn(net, 32, 3, 3, border_mode='valid')
    net = conv2d_bn(net, 64, 3, 3)

    # Stem merge 1: strided max pooling alongside a strided 3x3 convolution.
    branch_0 = MaxPooling2D((3, 3), strides=(2, 2), border_mode='valid')(net)

    branch_1 = conv2d_bn(net, 96, 3, 3, subsample=(2, 2), border_mode='valid')

    net = merge([branch_0, branch_1], mode='concat', concat_axis=channel_axis)

    # Stem merge 2: a short 1x1 -> 3x3 path alongside a factorized 7x7 path.
    branch_0 = conv2d_bn(net, 64, 1, 1)
    branch_0 = conv2d_bn(branch_0, 96, 3, 3, border_mode='valid')

    branch_1 = conv2d_bn(net, 64, 1, 1)
    branch_1 = conv2d_bn(branch_1, 64, 1, 7)
    branch_1 = conv2d_bn(branch_1, 64, 7, 1)
    branch_1 = conv2d_bn(branch_1, 96, 3, 3, border_mode='valid')

    net = merge([branch_0, branch_1], mode='concat', concat_axis=channel_axis)

    # Stem merge 3: strided 3x3 convolution alongside strided max pooling.
    branch_0 = conv2d_bn(net, 192, 3, 3, subsample=(2, 2), border_mode='valid')
    branch_1 = MaxPooling2D((3, 3), strides=(2, 2), border_mode='valid')(net)

    net = merge([branch_0, branch_1], mode='concat', concat_axis=channel_axis)

    # `range` (not the Python-2-only `xrange`) keeps the loops below working
    # on both Python 2 and Python 3; the loop index itself is not needed.

    # 35 x 35 x 384
    # 4 x Inception-A blocks
    for _ in range(4):
        net = block_inception_a(net)

    # 35 x 35 x 384
    # Reduction-A block
    net = block_reduction_a(net)

    # 17 x 17 x 1024
    # 7 x Inception-B blocks
    for _ in range(7):
        net = block_inception_b(net)

    # 17 x 17 x 1024
    # Reduction-B block
    net = block_reduction_b(net)

    # 8 x 8 x 1536
    # 3 x Inception-C blocks
    for _ in range(3):
        net = block_inception_c(net)

    return net

def inception_v4_model(img_rows, img_cols, color_type=1, num_classes=None, dropout_keep_prob=0.2):
    '''
    Inception V4 Model for Keras

    Model Schema is based on https://github.com/kentsommer/keras-inceptionV4

    ImageNet Pretrained Weights
    Theano: https://github.com/kentsommer/keras-inceptionV4/releases/download/2.0/inception-v4_weights_th_dim_ordering_th_kernels.h5
    TensorFlow: https://github.com/kentsommer/keras-inceptionV4/releases/download/2.0/inception-v4_weights_tf_dim_ordering_tf_kernels.h5

    The function first builds the network with a 1001-way softmax head,
    loads the pretrained weights (matched by layer name) from the
    `imagenet_models/` directory, and then attaches a fresh
    `num_classes`-way softmax head to the same trunk for transfer learning.

    Parameters:
      img_rows, img_cols - resolution of inputs
          (currently ignored: the input is hard-coded to 299 x 299)
      color_type - 1 for grayscale, 3 for color
          (currently ignored: the input is hard-coded to 3 channels)
      num_classes - number of class labels for our classification task
      dropout_keep_prob - value passed straight to the Dropout layers
          NOTE(review): despite the name, Keras `Dropout` takes the fraction
          of units to drop, not to keep - confirm which was intended.

    Returns:
      A Keras `Model` named 'inception_v4' mapping the input image tensor
      to the `num_classes`-way softmax predictions.
    '''

    # Input Shape is 299 x 299 x 3 (tf) or 3 x 299 x 299 (th)
    if K.image_dim_ordering() == 'th':
        inputs = Input((3, 299, 299))
    else:
        inputs = Input((299, 299, 3))

    # Make inception base
    net = inception_v4_base(inputs)

    # Final pooling and prediction

    # 8 x 8 x 1536
    net_old = AveragePooling2D((8, 8), border_mode='valid')(net)

    # 1 x 1 x 1536
    net_old = Dropout(dropout_keep_prob)(net_old)
    net_old = Flatten()(net_old)

    # 1536
    predictions = Dense(output_dim=1001, activation='softmax')(net_old)

    # This first model exists only so the pretrained weights can be loaded.
    model = Model(inputs, predictions, name='inception_v4')

    if K.image_dim_ordering() == 'th':
        # Use pre-trained weights for Theano backend
        weights_path = 'imagenet_models/inception-v4_weights_th_dim_ordering_th_kernels.h5'
    else:
        # Use pre-trained weights for Tensorflow backend
        weights_path = 'imagenet_models/inception-v4_weights_tf_dim_ordering_tf_kernels.h5'

    # weights_path = './InceptionV4_model_fold_01.h5'
    model.load_weights(weights_path, by_name=True)

    # Truncate and replace softmax layer for transfer learning
    # Cannot use model.layers.pop() since model is not of Sequential() type
    # The method below works since pre-trained weights are stored in layers but not in the model
    net_ft = AveragePooling2D((8, 8), border_mode='valid')(net)
    net_ft = Dropout(dropout_keep_prob)(net_ft)
    net_ft = Flatten()(net_ft)
    predictions_ft = Dense(output_dim=num_classes, activation='softmax')(net_ft)

    model = Model(inputs, predictions_ft, name='inception_v4')

    return model

if __name__ == '__main__':
    # Training entry point: fine-tunes Inception V4 on a two-class image
    # dataset read from directories, replicated across 2 GPUs.

    # import os
    # os.environ['CUDA_VISIBLE_DEVICES']='0'

    # dimensions of our images.
    # ADNI GM
    # X: 121*145
    # Y: 121*121
    # Z: 145*121

    # OASIS GM MRI
    # 176*208
    ### data_fold_01_train_val_test_entropy_keep_SliceNum_33
    img_width, img_height = 299, 299
    fold_name = "fold_01"  ## data_fold_01_entropy_keep_SliceNum_33
    ## single_subject_data_fold_01_train_val_test_entropy_keep_SliceNum_81
    train_data_dir = 'single_subject_data_' + fold_name + '_train_val_test_entropy_keep_SliceNum_81/train'
    validation_data_dir = 'single_subject_data_' + fold_name + '_train_val_test_entropy_keep_SliceNum_81/validation'
    filepath = "model_single_subject_InceptionV4_" + fold_name + "_train_val_test_entropy_keep_SliceNum_81_best.h5"

    # train num (AD+NC) = 36207 + 41796 = 78003
    # validation num (AD+NC) = 9477 + 11178 = 20655
    # test num (AD+NC) = 2673 + 2916 =
    # train_samples_AD =  len(os.listdir(path))
    nb_train_samples = 78003
    nb_validation_samples = 20655
    epochs = 120
    batch_size = 64  #10 #40
    channel = 3
    num_classes = 2

    print("=== paramaters info ===")
    print("epochs = {}.".format(epochs))
    print("batch_size = {}.".format(batch_size))
    print("nb_train_samples = {}.".format(nb_train_samples))
    print("nb_validation_samples = {}.".format(nb_validation_samples))

    #if K.image_data_format() == 'channels_first':
    #	input_shape = (3, img_width, img_height)
    #else:
    #	input_shape = (img_width, img_height, 3)

    # this is the augmentation configuration we will use for training
    train_datagen = ImageDataGenerator(
        rescale=1. / 255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)

    # this is the augmentation configuration we will use for testing:
    # only rescaling
    test_datagen = ImageDataGenerator(rescale=1. / 255)

    ### class_mode: one of "categorical", "binary", "sparse" or None.
    ### Default is "categorical". It determines the form of the returned label array:
    ### "categorical" returns 2D one-hot encoded labels,
    ### "binary" returns 1D binary labels,
    ### "sparse" returns 1D integer labels.
    ### If None, no labels are returned and the generator only yields batches of data;
    ### this is used with model.predict_generator(), model.evaluate_generator(), etc.

    train_generator = train_datagen.flow_from_directory(
        train_data_dir,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        class_mode='binary')

    validation_generator = test_datagen.flow_from_directory(
        validation_data_dir,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        class_mode='binary')

    # Load our model
    model = inception_v4_model(img_height, img_width, channel, num_classes, dropout_keep_prob=0.2)
    # Replicate the model on 2 GPUs; training goes through `parallel_model`.
    parallel_model = multi_gpu_model(model, gpus=2)
    # Learning rate is changed to 0.001
    sgd = optimizers.SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True)
    parallel_model.compile(optimizer=sgd, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # NOTE(review): the accompanying article reports that passing this
    # checkpoint callback to `parallel_model.fit_generator` fails with
    # "TypeError: can't pickle NotImplementedType objects". The Keras
    # documentation for `multi_gpu_model` recommends saving through the
    # template model (`model`) rather than the returned parallel model.
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath=filepath,
        monitor='val_acc',
        verbose=1,
        save_best_only=True,
        # save_weights_only=False,
        mode='max',
        period=1
    )
    callbacks_list = [checkpoint]

    ### verbose: logging mode. 0 = no log output to stdout, 1 = progress bar,
    ### 2 = one line per epoch.
    ###
    # Floor division keeps both step counts integral on Python 2 and
    # Python 3 alike (plain `/` yields a float on Python 3).
    parallel_model.fit_generator(
        train_generator,
        steps_per_epoch=nb_train_samples // batch_size,
        epochs=epochs,
        verbose=2,
        validation_data=validation_generator,
        validation_steps=nb_validation_samples // batch_size,
        callbacks=callbacks_list)

    # model.save('InceptionV4_model_fold_01.h5')

    # Make predictions
    #predictions_valid = model.predict(X_valid, batch_size=batch_size, verbose=1)

    # Cross-entropy loss score
    #score = log_loss(Y_valid, predictions_valid)

    ### CUDA_VISIBLE_DEVICES=0 python inception_v4_train_val_test_entropy_keep_SliceNum_81_fold_01_single_subject.py > acc_inception_v4_train_val_test_entropy_keep_SliceNum_81_fold_01_single_subject.txt
    ### python inception_v4_train_val_test_entropy_keep_SliceNum_81_fold_01_single_subject.py > acc_single_subject_inception_v4_train_val_test_entropy_keep_SliceNum_81_fold_01.txt

注意：

上述代码使用

# parallel_model.fit_generator(
# train_generator,
# steps_per_epoch=nb_train_samples/batch_size,
# epochs=epochs,
# verbose = 2,
# validation_data=validation_generator,
# validation_steps=nb_validation_samples/batch_size,
# callbacks = callbacks_list)
会报错：

TypeError: can't pickle NotImplementedType objects

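去掉 callbacks 即可（该错误出现在 ModelCheckpoint 回调尝试保存 multi_gpu_model 返回的并行模型时；去掉回调后训练过程中不会再自动保存最优模型，如需保存，可按 Keras 文档的建议对传入 multi_gpu_model 的原始模型 model 调用 save / save_weights），如下所示：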
# Same training call without the checkpoint callback.
# Floor division keeps both step counts integral on Python 2 and Python 3
# alike (plain `/` yields a float on Python 3).
parallel_model.fit_generator(
    train_generator,
    steps_per_epoch=nb_train_samples // batch_size,
    epochs=epochs,
    verbose=2,
    validation_data=validation_generator,
    validation_steps=nb_validation_samples // batch_size)

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： keras 多显卡数据并行

相关文章推荐

新的分享

章节导航