
TensorFlow distributed data parallelism, asynchronous training, between-graph replication: a hand-written CNN example

2017-02-10 13:48
# For general-purpose data parallelism, between-graph replication is recommended,
# because with in-graph replication you have to aggregate the per-replica costs yourself.

# Run on 10.100.203.75:
#python test_dis2.py --job_name=worker --ps_hosts=10.100.203.75:1111 --worker_hosts=10.100.206.209:2222,10.100.203.75:2223 --task_id=1
#python test_dis2.py --job_name=ps --ps_hosts=10.100.203.75:1111 --worker_hosts=10.100.206.209:2222,10.100.203.75:2223 --task_id=0

# Run on 10.100.206.209:
#python test_dis2.py --job_name=worker --ps_hosts=10.100.203.75:1111 --worker_hosts=10.100.206.209:2222,10.100.203.75:2223 --task_id=0

# Note: for a --job_name=worker script, task_id must match the position of that
# machine's address in --worker_hosts.
# In my experience it is best to start the ps first and then the two workers;
# in the end both workers print the training progress.
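# Between-graph replication means each worker process runs this same script and
# builds its own copy of the graph; the shared variables live on the ps task
# (see tf.train.replica_device_setter below), so the workers' updates are applied
# to the same weights asynchronously.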

import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('job_name', '', 'One of "ps", "worker"')
tf.app.flags.DEFINE_string('ps_hosts', '',
                           """Comma-separated list of hostname:port for the """
                           """parameter server jobs. e.g. """
                           """'machine1:2222,machine2:1111,machine2:2222'""")
tf.app.flags.DEFINE_string('worker_hosts', '',
                           """Comma-separated list of hostname:port for the """
                           """worker jobs. e.g. """
                           """'machine1:2222,machine2:1111,machine2:2222'""")
tf.app.flags.DEFINE_integer(
    'task_id', 0, 'Task id of the replica running the training.')

ps_hosts = FLAGS.ps_hosts.split(',')
worker_hosts = FLAGS.worker_hosts.split(',')
cluster_spec = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
server = tf.train.Server(cluster_spec,
                         job_name=FLAGS.job_name,
                         task_index=FLAGS.task_id)
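# tf.train.Server starts the gRPC server for this task on the host:port listed
# for it in the cluster spec; every task must be launched with the same
# --ps_hosts/--worker_hosts so they agree on the cluster layout.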

print("!!!!")
if FLAGS.job_name == 'ps':
    server.join()
print("!!!!")

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("./", one_hot=True)  # the four MNIST .gz files
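# read_data_sets downloads the four MNIST .gz files into "./" if they are not
# already present, so each worker machine needs the data or network access.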

learning_rate = 0.001
training_iters = 200000
batch_size = 128
display_step = 10

n_input = 784
n_classes = 10
dropout = 0.75
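# training_iters counts examples, not steps: with batch_size = 128 the loop below
# ("while step * batch_size < training_iters") runs roughly 200000 / 128 ≈ 1562 steps per worker.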

def conv2d(x, W, b, strides=1):
    x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
    x = tf.nn.bias_add(x, b)
    return tf.nn.relu(x)

def maxpool2d(x, k=2):
    return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                          padding='SAME')

def conv_net(x, weights, biases, dropout):
    x = tf.reshape(x, shape=[-1, 28, 28, 1])

    conv1 = conv2d(x, weights['wc1'], biases['bc1'])
    # Max Pooling (down-sampling)
    conv1 = maxpool2d(conv1, k=2)

    conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
    conv2 = maxpool2d(conv2, k=2)

    # Fully connected layer
    # Reshape conv2 output to fit fully connected layer input
    fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
    fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
    fc1 = tf.nn.relu(fc1)

    fc1 = tf.nn.dropout(fc1, dropout)

    out = tf.add(tf.matmul(fc1, weights['out']), biases['out'])
    return out

with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % FLAGS.task_id,
        cluster=cluster_spec)):
    # replica_device_setter places the Variables below on the ps job and
    # everything else on this worker's own device.
    x = tf.placeholder(tf.float32, [None, n_input])
    y = tf.placeholder(tf.float32, [None, n_classes])
    keep_prob = tf.placeholder(tf.float32)  # dropout (keep probability)

    weights = {
        # 5x5 conv, 1 input, 32 outputs
        'wc1': tf.Variable(tf.random_normal([5, 5, 1, 32])),
        # 5x5 conv, 32 inputs, 64 outputs
        'wc2': tf.Variable(tf.random_normal([5, 5, 32, 64])),
        # fully connected, 7*7*64 inputs, 1024 outputs
        'wd1': tf.Variable(tf.random_normal([7*7*64, 1024])),
        # 1024 inputs, 10 outputs (class prediction)
        'out': tf.Variable(tf.random_normal([1024, n_classes]))
    }

    biases = {
        'bc1': tf.Variable(tf.random_normal([32])),
        'bc2': tf.Variable(tf.random_normal([64])),
        'bd1': tf.Variable(tf.random_normal([1024])),
        'out': tf.Variable(tf.random_normal([n_classes]))
    }

    pred = conv_net(x, weights, biases, keep_prob)

    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))

    # Pass global_step to minimize() so the shared step counter actually advances;
    # the Supervisor below uses it for checkpointing.
    global_step = tf.Variable(0, name='global_step', trainable=False)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
        cost, global_step=global_step)

    correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    # tf.scalar_summary / tf.merge_all_summaries were renamed in TF 1.0
    tf.summary.scalar('cost', cost)
    summary_op = tf.summary.merge_all()
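# This is asynchronous data parallelism: each worker computes gradients on its
# own batch and applies them to the shared variables independently. A synchronous
# variant could wrap the optimizer, roughly like this (a sketch only, not used here):
#   opt = tf.train.SyncReplicasOptimizer(
#       tf.train.AdamOptimizer(learning_rate),
#       replicas_to_aggregate=len(worker_hosts),
#       total_num_replicas=len(worker_hosts))
#   optimizer = opt.minimize(cost, global_step=global_step)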

sv = tf.train.Supervisor(is_chief=(FLAGS.task_id == 0),
                         logdir="C:\\Users\\guotong1\\Desktop\\checkpoint",
                         init_op=init,
                         summary_op=None,
                         saver=saver,
                         global_step=global_step,
                         save_model_secs=60)
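# Only the chief (task_id 0) runs init_op and writes checkpoints to logdir;
# the Supervisor makes the non-chief worker wait until the chief has
# initialized the shared variables.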
# Launch the graph
# Both workers start a session here; server.target is that machine's own
# in-process server (its localhost).
with sv.managed_session(server.target) as sess:
    # No explicit sess.run(init) is needed: the chief runs init_op and the
    # other worker waits for the variables to be initialized.
    step = 1

    while step * batch_size < training_iters:
        batch_x, batch_y = mnist.train.next_batch(batch_size)

        sess.run(optimizer, feed_dict={x: batch_x, y: batch_y,
                                       keep_prob: dropout})
        if step % display_step == 0:
            loss, acc = sess.run([cost, accuracy], feed_dict={x: batch_x,
                                                              y: batch_y,
                                                              keep_prob: 1.})
            print("Iter " + str(step*batch_size) + ", Minibatch Loss= " + \
                  "{:.6f}".format(loss) + ", Training Accuracy= " + \
                  "{:.5f}".format(acc))
        step += 1
    print("Optimization Finished!")

    # Calculate accuracy for 256 mnist test images
    print("Testing Accuracy:", \
          sess.run(accuracy, feed_dict={x: mnist.test.images[:256],
                                        y: mnist.test.labels[:256],
                                        keep_prob: 1.}))
sv.stop()