Tensorflow Deep MNIST: Resource exhausted: OOM when allocating tensor with shape[10000,32,28,28]

Original · 2017-07-04 16:51:42
Tags: Deep Learning / tensorflow / oom

Today, while testing a convolutional neural network, I hit the error shown in the title. The code I ran is as follows.


    from tensorflow.examples.tutorials.mnist import input_data
    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    
    import tensorflow as tf
    sess = tf.InteractiveSession()
    
    x = tf.placeholder(tf.float32, shape=[None, 784])
    y_ = tf.placeholder(tf.float32, shape=[None, 10])
    
    W = tf.Variable(tf.zeros([784,10]))
    b = tf.Variable(tf.zeros([10]))
    
    y = tf.nn.softmax(tf.matmul(x,W) + b)
    
    def weight_variable(shape):
      initial = tf.truncated_normal(shape, stddev=0.1)
      return tf.Variable(initial)
    
    def bias_variable(shape):
      initial = tf.constant(0.1, shape=shape)
      return tf.Variable(initial)
    
    
    def conv2d(x, W):
      return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
    
    def max_pool_2x2(x):
      return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
                            strides=[1, 2, 2, 1], padding='SAME')
    
    
    W_conv1 = weight_variable([5, 5, 1, 32])
    b_conv1 = bias_variable([32])
    
    
    x_image = tf.reshape(x, [-1,28,28,1])
    
    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
    h_pool1 = max_pool_2x2(h_conv1)
    
    W_conv2 = weight_variable([5, 5, 32, 64])
    b_conv2 = bias_variable([64])
    
    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
    h_pool2 = max_pool_2x2(h_conv2)
    
    W_fc1 = weight_variable([7 * 7 * 64, 1024])
    b_fc1 = bias_variable([1024])
    
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
    
    keep_prob = tf.placeholder(tf.float32)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
    
    W_fc2 = weight_variable([1024, 10])
    b_fc2 = bias_variable([10])
    
    y_conv=tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
    
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv), reduction_indices=[1]))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    
    init = tf.initialize_all_variables()
    config = tf.ConfigProto()
    config.gpu_options.allocator_type = 'BFC'
    with tf.Session(config = config) as s:
      sess.run(init)   # note: init runs in the InteractiveSession `sess`, not in `s`
    
    for i in range(20000):
      batch = mnist.train.next_batch(50)
      if i%100 == 0:
        train_accuracy = accuracy.eval(feed_dict={
            x:batch[0], y_: batch[1], keep_prob: 1.0})
        print("step %d, training accuracy %g"%(i, train_accuracy))
      train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
    
    print("test accuracy %g"%accuracy.eval(feed_dict={
        x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))

    The error output was roughly as follows:

    W tensorflow/core/common_runtime/bfc_allocator.cc:270] **********************************************************xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
    W tensorflow/core/common_runtime/bfc_allocator.cc:271] Ran out of memory trying to allocate 957.03MiB.  See logs for memory state.
    W tensorflow/core/framework/op_kernel.cc:968] Resource exhausted: OOM when allocating tensor with shape[10000,32,28,28]
    Traceback (most recent call last):
      File "trainer_deepMnist.py", line 109, in <module>
        x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 559, in eval
        return _eval_using_default_session(self, feed_dict, self.graph, session)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 3648, in _eval_using_default_session
        return session.run(tensors, feed_dict)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 710, in run
        run_metadata_ptr)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 908, in _run
        feed_dict_string, options, run_metadata)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 958, in _do_run
        target_list, options, run_metadata)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 978, in _do_call
        raise type(e)(node_def, op, message)
    tensorflow.python.framework.errors.ResourceExhaustedError: OOM when allocating tensor with shape[10000,32,28,28]
         [[Node: Conv2D = Conv2D[T=DT_FLOAT, data_format="NHWC", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](Reshape, Variable_2/read)]]
    Caused by op u'Conv2D', defined at:
      File "trainer_deepMnist.py", line 61, in <module>
        h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
      File "trainer_deepMnist.py", line 46, in conv2d
        return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_nn_ops.py", line 394, in conv2d
        data_format=data_format, name=name)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 703, in apply_op
        op_def=op_def)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2320, in create_op
        original_op=self._default_original_op, op_def=op_def)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1239, in __init__
        self._traceback = _extract_stack()

    The cause is GPU OOM: there is not enough GPU memory to run the accuracy evaluation over the entire test set in one shot, so the evaluation has to be switched to batch processing.
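
    A quick back-of-the-envelope check matches the 957.03MiB in the log: the first conv layer's output for the full test set is a float32 tensor of shape [10000, 32, 28, 28], at 4 bytes per element:

    # 10000 * 32 * 28 * 28 = 250,880,000 float32 elements
    # 250,880,000 * 4 bytes / 2**20 = 957.03 MiB -- the exact allocation that failed
    print(10000 * 32 * 28 * 28 * 4 / float(2**20))   # 957.03125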

    The explanation of the cause (from the Stack Overflow answer linked at the end):

    Here is how I solved this problem: the error means that the GPU runs out of memory during accuracy evaluation. Hence it needs a smaller sized dataset, which can be achieved by using data in batches. So, instead of running the code on the whole test dataset it needs to be run in batches.

    Solution: replace that final print statement with the three lines below, printing the accuracy batch by batch, and the problem goes away.


    for i in xrange(10):   # if xrange raises an error (Python 3), use range instead
        testSet = mnist.test.next_batch(50)
        print("test accuracy %g"%accuracy.eval(feed_dict={ x: testSet[0], y_: testSet[1], keep_prob: 1.0}))

    Adapted from this Stack Overflow question:

    https://stackoverflow.com/questions/39076388/tensorflow-deep-mnist-resource-exhausted-oom-when-alloc
