Tensorflow Deep MNIST: Resource exhausted: OOM when allocating tensor with shape[10000,32,28,28]

Original · 2017-07-04 16:51:42
Tags: Deep Learning / tensorflow / oom

Today, while testing a convolutional neural network, I hit the error shown in the title. The code I ran is as follows.


    from tensorflow.examples.tutorials.mnist import input_data
    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    
    import tensorflow as tf
    sess = tf.InteractiveSession()
    
    x = tf.placeholder(tf.float32, shape=[None, 784])
    y_ = tf.placeholder(tf.float32, shape=[None, 10])
    
    W = tf.Variable(tf.zeros([784,10]))
    b = tf.Variable(tf.zeros([10]))
    
    y = tf.nn.softmax(tf.matmul(x,W) + b)
    
    def weight_variable(shape):
      initial = tf.truncated_normal(shape, stddev=0.1)
      return tf.Variable(initial)
    
    def bias_variable(shape):
      initial = tf.constant(0.1, shape=shape)
      return tf.Variable(initial)
    
    
    def conv2d(x, W):
      return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
    
    def max_pool_2x2(x):
      return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
                            strides=[1, 2, 2, 1], padding='SAME')
    
    
    W_conv1 = weight_variable([5, 5, 1, 32])
    b_conv1 = bias_variable([32])
    
    
    x_image = tf.reshape(x, [-1,28,28,1])
    
    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
    h_pool1 = max_pool_2x2(h_conv1)
    
    W_conv2 = weight_variable([5, 5, 32, 64])
    b_conv2 = bias_variable([64])
    
    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
    h_pool2 = max_pool_2x2(h_conv2)
    
    W_fc1 = weight_variable([7 * 7 * 64, 1024])
    b_fc1 = bias_variable([1024])
    
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
    
    keep_prob = tf.placeholder(tf.float32)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
    
    W_fc2 = weight_variable([1024, 10])
    b_fc2 = bias_variable([10])
    
    y_conv=tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
    
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv), reduction_indices=[1]))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    
    init = tf.initialize_all_variables()
    config = tf.ConfigProto()
    config.gpu_options.allocator_type = 'BFC'
    with tf.Session(config = config) as s:
      sess.run(init)   # note: init runs in the InteractiveSession `sess`, not in `s`
    
    for i in range(20000):
      batch = mnist.train.next_batch(50)
      if i%100 == 0:
        train_accuracy = accuracy.eval(feed_dict={
            x:batch[0], y_: batch[1], keep_prob: 1.0})
        print("step %d, training accuracy %g"%(i, train_accuracy))
      train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
    
    print("test accuracy %g"%accuracy.eval(feed_dict={
        x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))

    The error output was roughly as follows:

    W tensorflow/core/common_runtime/bfc_allocator.cc:270] **********************************************************xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
    W tensorflow/core/common_runtime/bfc_allocator.cc:271] Ran out of memory trying to allocate 957.03MiB.  See logs for memory state.
    W tensorflow/core/framework/op_kernel.cc:968] Resource exhausted: OOM when allocating tensor with shape[10000,32,28,28]
    Traceback (most recent call last):
      File "trainer_deepMnist.py", line 109, in <module>
        x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 559, in eval
        return _eval_using_default_session(self, feed_dict, self.graph, session)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 3648, in _eval_using_default_session
        return session.run(tensors, feed_dict)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 710, in run
        run_metadata_ptr)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 908, in _run
        feed_dict_string, options, run_metadata)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 958, in _do_run
        target_list, options, run_metadata)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 978, in _do_call
        raise type(e)(node_def, op, message)
    tensorflow.python.framework.errors.ResourceExhaustedError: OOM when allocating tensor with shape[10000,32,28,28]
         [[Node: Conv2D = Conv2D[T=DT_FLOAT, data_format="NHWC", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](Reshape, Variable_2/read)]]
    Caused by op u'Conv2D', defined at:
      File "trainer_deepMnist.py", line 61, in <module>
        h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
      File "trainer_deepMnist.py", line 46, in conv2d
        return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_nn_ops.py", line 394, in conv2d
        data_format=data_format, name=name)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 703, in apply_op
        op_def=op_def)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2320, in create_op
        original_op=self._default_original_op, op_def=op_def)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1239, in __init__
        self._traceback = _extract_stack()

    The cause is GPU OOM: there is not enough GPU memory to run the accuracy evaluation over the entire test set in one shot, so the evaluation has to be switched to batch processing.
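
    A quick back-of-the-envelope check matches the 957.03MiB in the log: the first conv layer's output for the full test set is a float32 tensor of shape [10000, 32, 28, 28], at 4 bytes per element:

    # 10000 * 32 * 28 * 28 = 250,880,000 float32 elements
    # 250,880,000 * 4 bytes / 2**20 = 957.03 MiB -- the exact allocation that failed
    print(10000 * 32 * 28 * 28 * 4 / float(2**20))   # 957.03125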

    The explanation of the cause (from the Stack Overflow answer linked at the end):

    Here is how I solved this problem: the error means that the GPU runs out of memory during accuracy evaluation. Hence it needs a smaller sized dataset, which can be achieved by using data in batches. So, instead of running the code on the whole test dataset it needs to be run in batches.

    Solution: replace that final print statement with the three lines below, printing the accuracy batch by batch, and the problem goes away.


    for i in xrange(10):   # if xrange raises an error (Python 3), use range instead
        testSet = mnist.test.next_batch(50)
        print("test accuracy %g"%accuracy.eval(feed_dict={ x: testSet[0], y_: testSet[1], keep_prob: 1.0}))

    Adapted from this Stack Overflow question:

    https://stackoverflow.com/questions/39076388/tensorflow-deep-mnist-resource-exhausted-oom-when-alloc
