TensorFlow distributed data-parallel asynchronous training (between-graph): a hand-written RNN example
2017-02-10 17:47
# See the previous two posts for how to run this script
import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('job_name', '', 'One of "ps", "worker"')
tf.app.flags.DEFINE_string('ps_hosts', '',
                           """Comma-separated list of hostname:port for the """
                           """parameter server jobs. e.g. """
                           """'machine1:2222,machine2:1111,machine2:2222'""")
tf.app.flags.DEFINE_string('worker_hosts', '',
                           """Comma-separated list of hostname:port for the """
                           """worker jobs. e.g. """
                           """'machine1:2222,machine2:1111,machine2:2222'""")
tf.app.flags.DEFINE_integer('task_id', 0,
                            'Task id of the replica running the training.')

ps_hosts = FLAGS.ps_hosts.split(',')
worker_hosts = FLAGS.worker_hosts.split(',')
cluster_spec = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
server = tf.train.Server(cluster_spec,
                         job_name=FLAGS.job_name,
                         task_index=FLAGS.task_id)

# Parameter-server processes just host variables and block here;
# only worker processes execute the rest of the script.
if FLAGS.job_name == 'ps':
    server.join()

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("./", one_hot=True)

# Parameters
learning_rate = 0.001
training_iters = 100000
batch_size = 128
display_step = 10

# Network Parameters
n_input = 28    # MNIST data input (img shape: 28*28)
n_steps = 28    # timesteps
n_hidden = 128  # hidden layer num of features
n_classes = 10  # MNIST total classes (0-9 digits)

def RNN(x, weights, biases):
    # Prepare data shape to match `rnn` function requirements
    # Current data input shape: (batch_size, n_steps, n_input)
    # Required shape: 'n_steps' tensors list of shape (batch_size, n_input)

    # Permuting batch_size and n_steps
    x = tf.transpose(x, [1, 0, 2])
    # Reshaping to (n_steps*batch_size, n_input)
    x = tf.reshape(x, [-1, n_input])
    # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
    # (TF 1.x argument order: tf.split(value, num_splits, axis))
    x = tf.split(x, n_steps, 0)

    # Define an LSTM cell with tensorflow
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(n_hidden, forget_bias=1.0)
    # Get LSTM cell output
    outputs, states = tf.contrib.rnn.static_rnn(lstm_cell, x, dtype=tf.float32)
    # Linear activation, using rnn inner loop last output
    return tf.matmul(outputs[-1], weights['out']) + biases['out']

# replica_device_setter pins variables to the ps job and ops to this worker,
# so every worker updates the same shared parameters.
with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % FLAGS.task_id,
        cluster=cluster_spec)):
    # tf Graph input
    x = tf.placeholder("float", [None, n_steps, n_input])
    y = tf.placeholder("float", [None, n_classes])

    # Define weights
    weights = {'out': tf.Variable(tf.random_normal([n_hidden, n_classes]))}
    biases = {'out': tf.Variable(tf.random_normal([n_classes]))}

    pred = RNN(x, weights, biases)

    # Define loss and optimizer; pass global_step so the Supervisor's
    # checkpoints track training progress.
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
    global_step = tf.Variable(0, name='global_step', trainable=False)
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(cost, global_step=global_step)

    # Evaluate model
    correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Initializing the variables
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    tf.summary.scalar('cost', cost)
    summary_op = tf.summary.merge_all()

sv = tf.train.Supervisor(is_chief=(FLAGS.task_id == 0),
                         logdir="C:\\Users\\guotong1\\Desktop\\checkpoint",
                         init_op=init,
                         summary_op=None,
                         saver=saver,
                         global_step=global_step,
                         save_model_secs=60)

# Launch the graph. The Supervisor runs init_op on the chief worker and
# makes the other workers wait for / restore the shared variables, so no
# explicit sess.run(init) is needed here.
with sv.managed_session(server.target) as sess:
    step = 1
    # Keep training until reach max iterations
    while step * batch_size < training_iters:
        batch_x, batch_y = mnist.train.next_batch(batch_size)
        # Reshape data to get 28 seq of 28 elements
        batch_x = batch_x.reshape((batch_size, n_steps, n_input))
        # Run optimization op (backprop)
        sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})
        if step % display_step == 0:
            # Calculate batch accuracy and loss
            acc = sess.run(accuracy, feed_dict={x: batch_x, y: batch_y})
            loss = sess.run(cost, feed_dict={x: batch_x, y: batch_y})
            print("Iter " + str(step * batch_size) + ", Minibatch Loss= " +
                  "{:.6f}".format(loss) + ", Training Accuracy= " +
                  "{:.5f}".format(acc))
        step += 1
    print("Optimization Finished!")

    # Calculate accuracy for 128 mnist test images
    test_len = 128
    test_data = mnist.test.images[:test_len].reshape((-1, n_steps, n_input))
    test_label = mnist.test.labels[:test_len]
    print("Testing Accuracy:",
          sess.run(accuracy, feed_dict={x: test_data, y: test_label}))

sv.stop()
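For reference, here is a minimal way to launch the cluster implied by the flags above: one parameter-server process plus one process per worker. The script name dist_rnn.py, the hosts, and the ports are illustrative placeholders; substitute your own.

python dist_rnn.py --job_name=ps --ps_hosts=localhost:2222 --worker_hosts=localhost:2223,localhost:2224 --task_id=0
python dist_rnn.py --job_name=worker --ps_hosts=localhost:2222 --worker_hosts=localhost:2223,localhost:2224 --task_id=0
python dist_rnn.py --job_name=worker --ps_hosts=localhost:2222 --worker_hosts=localhost:2223,localhost:2224 --task_id=1

Each worker process builds its own copy of the graph (between-graph replication) and applies its gradients to the shared variables on the ps job without waiting for the other workers, which is what makes the training asynchronous. The chief (task_id 0) initializes the variables and saves a checkpoint to the logdir every 60 seconds.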