Tensorflow2.0:TFRecord数值数据保存【1】

  • Post author:
  • Post category:其他


1、单组列表数据集的存储

  1. 数据列表:seq
  2. 创建单个特征:tf.train.Feature(

    列表

    )
  3. 创建特征列表:tf.train.FeatureList,并以字典形式放入tf.train.FeatureLists
  4. 嵌入序列实例:example = tf.train.SequenceExample
  5. 保存:seq_writer = tf.io.TFRecordWriter("filename");seq_writer.write(example.SerializeToString())
import tensorflow as tf
# Build a single-sequence dataset.
seq = [1,2,3,4,5,6,7,8,9,0]
# Pack the whole sequence into one int64 Feature.
features_seq = tf.train.Feature(int64_list=tf.train.Int64List(value=seq))
# Wrap that Feature in a FeatureList and register it by name in FeatureLists.
feature_lists = tf.train.FeatureLists(feature_list={'features_seq': tf.train.FeatureList(feature=[features_seq])})
# Embed the feature lists in a SequenceExample.
example = tf.train.SequenceExample(feature_lists=feature_lists)
# Use the writer as a context manager so the record is flushed and the file
# is closed before anything tries to read it back.
with tf.io.TFRecordWriter("seq.tfrecord") as seq_writer:
    seq_writer.write(example.SerializeToString())


注意参数:

Feature的参数是list列表,FeatureList的参数feature为Feature组成的列表,FeatureLists的参数feature_list为字典

2、包含列表的列表数据集的存储

  1. 使用for循环获取所含列表进行存储
import tensorflow as tf
seq_list = [[1,2,3,4,5,6,7,8,9,0],[1,2,3]]
# The context manager flushes and closes the file once every record is written.
with tf.io.TFRecordWriter("seq.tfrecord") as seq_writer:
    for seq in seq_list:
        # Pack the current sequence into one int64 Feature.
        features_seq = tf.train.Feature(int64_list=tf.train.Int64List(value=seq))
        # Wrap that Feature in a FeatureList and register it by name in FeatureLists.
        feature_lists = tf.train.FeatureLists(feature_list={'features_seq': tf.train.FeatureList(feature=[features_seq])})
        # Embed the feature lists in a SequenceExample and write one record per sequence.
        example = tf.train.SequenceExample(feature_lists=feature_lists)
        seq_writer.write(example.SerializeToString())


注意参数:

Feature的参数是list列表,FeatureList的参数feature为Feature组成的列表,FeatureLists的参数feature_list为字典,循环嵌入example并写入

3、并列列表数据集的存储

并列与单组列表没什么区别,逐一创建特征Feature并在特征列表FeatureList中进行整合即可

import tensorflow as tf
seq_1 = [1,2,3,4,5,6,7,8,9,0]
seq_2 = [0,9,8,7,6,5,4,3,2,1]
# Build one int64 Feature per sequence.
features_seq_1 = tf.train.Feature(int64_list=tf.train.Int64List(value=seq_1))
features_seq_2 = tf.train.Feature(int64_list=tf.train.Int64List(value=seq_2))
# Put both Features into a single FeatureList stored under one key.
feature_lists = tf.train.FeatureLists(feature_list={
    'features_seq': tf.train.FeatureList(feature=[features_seq_1, features_seq_2])
})
# Embed the feature lists in a SequenceExample.
example = tf.train.SequenceExample(feature_lists=feature_lists)
# Use the writer as a context manager so the record is flushed and the file
# is closed before anything tries to read it back.
with tf.io.TFRecordWriter("seq.tfrecord") as seq_writer:
    seq_writer.write(example.SerializeToString())


注意参数:

Feature的参数是list列表,FeatureLists的参数feature_list为字典,字典的值FeatureList内部包含数组[features_seq_1,features_seq_2]

4、列表数据集写入与读取

待写入数据如下:

seq_list = [[1, 2, 3], [1, 2], [1, 2, 3], [1, 2], [1, 2, 3], [1, 2], [1, 2, 3], [1, 2]]

首先构建写入函数,注意这里是map逐个元素作Feature,然后构成list(Feature) ->

encoder_smiles_input_feature

def generate_tfrecords(tfrecod_filename,seq_list):
    """Write every sequence in seq_list as one SequenceExample record.

    Each element of a sequence becomes its own single-value int64 Feature;
    the Features of one sequence form the FeatureList stored under the key
    'seq_feature'.
    """
    with tf.io.TFRecordWriter(tfrecod_filename) as writer:
        for seq in seq_list:
            # One Feature per element of the sequence.
            per_step_features = [
                tf.train.Feature(int64_list=tf.train.Int64List(value=[step]))
                for step in seq
            ]
            step_list = tf.train.FeatureList(feature=per_step_features)
            record = tf.train.SequenceExample(
                feature_lists=tf.train.FeatureLists(feature_list={'seq_feature': step_list})
            )
            writer.write(record.SerializeToString())

构建读取函数,

主要为

tf.io.parse_single_sequence_example

和参数

sequence_features

对于

tf.io.FixedLenSequenceFeature([], dtype=tf.int64)

可用不定长度的

tf.io.VarLenFeature(tf.int64)

替换,此时对shape没有要求,不过输出为

稀疏张量SparseTensor

。否则当shape不对时,会报错:

values size: 10 but output shape: []

,当然可以给参数10来解决,但当成员shape不统一时只能用不定长的了

def single_example_parser(serialized_example):
    """Parse one serialized SequenceExample and return its 'seq_feature' tensor."""
    # Each step is a scalar int64; the number of steps follows the record's FeatureList.
    spec = {"seq_feature": tf.io.FixedLenSequenceFeature([], dtype=tf.int64)}
    _context, sequences = tf.io.parse_single_sequence_example(
        serialized=serialized_example,
        sequence_features=spec,
    )
    return sequences['seq_feature']

完整代码如下

import tensorflow as tf
seq_list = [[1, 2, 3], [1, 2], [1, 2, 3], [1, 2], [1, 2, 3], [1, 2], [1, 2, 3], [1, 2]]

def generate_tfrecords(tfrecod_filename,seq_list):
    """Write every sequence in seq_list as one SequenceExample record.

    Each element of a sequence becomes its own single-value int64 Feature;
    the Features of one sequence form the FeatureList stored under the key
    'seq_feature'.
    """
    with tf.io.TFRecordWriter(tfrecod_filename) as writer:
        for seq in seq_list:
            # One Feature per element of the sequence.
            per_step_features = [
                tf.train.Feature(int64_list=tf.train.Int64List(value=[step]))
                for step in seq
            ]
            step_list = tf.train.FeatureList(feature=per_step_features)
            record = tf.train.SequenceExample(
                feature_lists=tf.train.FeatureLists(feature_list={'seq_feature': step_list})
            )
            writer.write(record.SerializeToString())

# def single_example_parser(serialized_example):
#     sequence_features = {"seq_feature": tf.data.FixedLengthRecordDataset([], record_bytes=tf.int64)}#按记录的长度提取
#     _, sequence_parsed = tf.parse_single_sequence_example(serialized=serialized_example,sequence_features=sequence_features)
#     seq = sequence_parsed['seq_feature']
#     return seq

# Path where the TFRecord data is stored.
tfrecord_filename = './seq_dataset.tfrecord'
# Generate the TFRecord data from seq_list.
generate_tfrecords(tfrecod_filename=tfrecord_filename, seq_list=seq_list)

def single_example_parser(serialized_example):
    """Parse one serialized SequenceExample and return its 'seq_feature' tensor."""
    # Each step is a scalar int64; the number of steps follows the record's FeatureList.
    spec = {"seq_feature": tf.io.FixedLenSequenceFeature([], dtype=tf.int64)}
    _context, sequences = tf.io.parse_single_sequence_example(
        serialized=serialized_example,
        sequence_features=spec,
    )
    return sequences['seq_feature']

# Turn the file pattern into a dataset of file names, read the records,
# and parse each one back into its sequence tensor.
file_path_list = tf.data.Dataset.list_files(["./seq_dataset.tfrecord"])
dataset = tf.data.TFRecordDataset(file_path_list).map(single_example_parser)
for parsed_seq in dataset:
    print(parsed_seq)

打印如下:以

tensor格式

输出

tf.Tensor([1 2 3], shape=(3,), dtype=int64)
tf.Tensor([1 2], shape=(2,), dtype=int64)
tf.Tensor([1 2 3], shape=(3,), dtype=int64)
tf.Tensor([1 2], shape=(2,), dtype=int64)
tf.Tensor([1 2 3], shape=(3,), dtype=int64)
tf.Tensor([1 2], shape=(2,), dtype=int64)
tf.Tensor([1 2 3], shape=(3,), dtype=int64)
tf.Tensor([1 2], shape=(2,), dtype=int64)

当我们把sequence_feature换成不定长时,打印如下:以

稀疏张量SparseTensor

格式输出

SparseTensor(indices=tf.Tensor(
[[0 0]
 [1 0]
 [2 0]], shape=(3, 2), dtype=int64), values=tf.Tensor([1 2 3], shape=(3,), dtype=int64), dense_shape=tf.Tensor([3 1], shape=(2,), dtype=int64))
SparseTensor(indices=tf.Tensor(
[[0 0]
 [1 0]], shape=(2, 2), dtype=int64), values=tf.Tensor([1 2], shape=(2,), dtype=int64), dense_shape=tf.Tensor([2 1], shape=(2,), dtype=int64))
SparseTensor(indices=tf.Tensor(
[[0 0]
 [1 0]
 [2 0]], shape=(3, 2), dtype=int64), values=tf.Tensor([1 2 3], shape=(3,), dtype=int64), dense_shape=tf.Tensor([3 1], shape=(2,), dtype=int64))
SparseTensor(indices=tf.Tensor(
[[0 0]
 [1 0]], shape=(2, 2), dtype=int64), values=tf.Tensor([1 2], shape=(2,), dtype=int64), dense_shape=tf.Tensor([2 1], shape=(2,), dtype=int64))
SparseTensor(indices=tf.Tensor(
[[0 0]
 [1 0]
 [2 0]], shape=(3, 2), dtype=int64), values=tf.Tensor([1 2 3], shape=(3,), dtype=int64), dense_shape=tf.Tensor([3 1], shape=(2,), dtype=int64))
SparseTensor(indices=tf.Tensor(
[[0 0]
 [1 0]], shape=(2, 2), dtype=int64), values=tf.Tensor([1 2], shape=(2,), dtype=int64), dense_shape=tf.Tensor([2 1], shape=(2,), dtype=int64))
SparseTensor(indices=tf.Tensor(
[[0 0]
 [1 0]
 [2 0]], shape=(3, 2), dtype=int64), values=tf.Tensor([1 2 3], shape=(3,), dtype=int64), dense_shape=tf.Tensor([3 1], shape=(2,), dtype=int64))
SparseTensor(indices=tf.Tensor(
[[0 0]
 [1 0]], shape=(2, 2), dtype=int64), values=tf.Tensor([1 2], shape=(2,), dtype=int64), dense_shape=tf.Tensor([2 1], shape=(2,), dtype=int64))

对1、2、3的输出打印

我们都用以下代码进行打印,看一看

def single_example_parser(serialized_example):
    """Parse one serialized SequenceExample and return its 'features_seq' entry."""
    # VarLenFeature puts no constraint on the per-step shape.
    spec = {'features_seq': tf.io.VarLenFeature(tf.int64)}
    _context, sequences = tf.io.parse_single_sequence_example(
        serialized=serialized_example,
        sequence_features=spec,
    )
    return sequences['features_seq']

# Turn the file pattern into a dataset of file names, read the records,
# and parse each one with single_example_parser.
file_path_list = tf.data.Dataset.list_files(["./seq.tfrecord"])
dataset = tf.data.TFRecordDataset(file_path_list).map(single_example_parser)
for parsed_seq in dataset:
    print(parsed_seq)

第一个为:

SparseTensor(indices=tf.Tensor(

[[0 0]

[0 1]

[0 2]

[0 3]

[0 4]

[0 5]

[0 6]

[0 7]

[0 8]

[0 9]], shape=(10, 2), dtype=int64), values=tf.Tensor([1 2 3 4 5 6 7 8 9 0], shape=(10,), dtype=int64), dense_shape=tf.Tensor([ 1 10], shape=(2,), dtype=int64))

第二个为:

SparseTensor(indices=tf.Tensor(

[[0 0]

[0 1]

[0 2]

[0 3]

[0 4]

[0 5]

[0 6]

[0 7]

[0 8]

[0 9]], shape=(10, 2), dtype=int64), values=tf.Tensor([1 2 3 4 5 6 7 8 9 0], shape=(10,), dtype=int64), dense_shape=tf.Tensor([ 1 10], shape=(2,), dtype=int64))

SparseTensor(indices=tf.Tensor(

[[0 0]

[0 1]

[0 2]], shape=(3, 2), dtype=int64), values=tf.Tensor([1 2 3], shape=(3,), dtype=int64), dense_shape=tf.Tensor([1 3], shape=(2,), dtype=int64))

第三个为:

SparseTensor(indices=tf.Tensor(

[[0 0]

[0 1]

[0 2]

[0 3]

[0 4]

[0 5]

[0 6]

[0 7]

[0 8]

[0 9]

[1 0]

[1 1]

[1 2]

[1 3]

[1 4]

[1 5]

[1 6]

[1 7]

[1 8]

[1 9]], shape=(20, 2), dtype=int64), values=tf.Tensor([1 2 3 4 5 6 7 8 9 0 0 9 8 7 6 5 4 3 2 1], shape=(20,), dtype=int64), dense_shape=tf.Tensor([ 2 10], shape=(2,), dtype=int64))

也可以用4的方法进行

list(map(lambda input :tf.train.Feature(int64_list=tf.train.Int64List(value=[input])),seq_1))

逐个输入到

feature_list

中,逐次写入

import tensorflow as tf
seq_1 = [1,2,3,4,5,6,7,8,9,0]
seq_2 = [0,9,8,7,6,5,4,3,2,1]
# Build one single-value int64 Feature per element, so each sequence becomes a
# list of Features (instead of one Feature holding the whole sequence).
features_seq_1 = [tf.train.Feature(int64_list=tf.train.Int64List(value=[step])) for step in seq_1]
features_seq_2 = [tf.train.Feature(int64_list=tf.train.Int64List(value=[step])) for step in seq_2]
# Each sequence gets its own FeatureLists, both using the key 'features_seq'.
feature_lists_1 = tf.train.FeatureLists(feature_list={
    'features_seq': tf.train.FeatureList(feature=features_seq_1)
})
feature_lists_2 = tf.train.FeatureLists(feature_list={
    'features_seq': tf.train.FeatureList(feature=features_seq_2)
})
# Embed each FeatureLists in its own SequenceExample.
example_1 = tf.train.SequenceExample(feature_lists=feature_lists_1)
example_2 = tf.train.SequenceExample(feature_lists=feature_lists_2)
# Write both records inside a context manager: the file must be flushed and
# closed before it is read back.
with tf.io.TFRecordWriter("seq.tfrecord") as seq_writer:
    seq_writer.write(example_1.SerializeToString())
    seq_writer.write(example_2.SerializeToString())


def single_example_parser(serialized_example):
    """Parse one serialized SequenceExample and return its 'features_seq' tensor."""
    # Each step is a scalar int64; the number of steps follows the record's FeatureList.
    spec = {'features_seq': tf.io.FixedLenSequenceFeature([], dtype=tf.int64)}
    _context, sequences = tf.io.parse_single_sequence_example(
        serialized=serialized_example,
        sequence_features=spec,
    )
    return sequences['features_seq']

# Turn the file pattern into a dataset of file names, read the records,
# and parse each one with single_example_parser.
file_path_list = tf.data.Dataset.list_files(["./seq.tfrecord"])
dataset = tf.data.TFRecordDataset(file_path_list).map(single_example_parser)
for parsed_seq in dataset:
    print(parsed_seq)

此时输出为张量Tensor格式:

tf.Tensor([1 2 3 4 5 6 7 8 9 0], shape=(10,), dtype=int64)
tf.Tensor([0 9 8 7 6 5 4 3 2 1], shape=(10,), dtype=int64)

最后可以搭配shuffle()、map()、repeat()、batch()批量输出

# batch_size must be defined before the pipeline is built; 2 is only an example value.
batch_size = 2
# Plain batch() cannot stack sequences of different lengths, so pad every
# sequence in a batch to the length of the longest one (padding value 0).
dataset = (
    tf.data.TFRecordDataset(tfrecord_filename)
    .map(single_example_parser)
    .shuffle(batch_size * 10)
    .padded_batch(batch_size, padded_shapes=[None])
    .repeat(2)
)



版权声明:本文为weixin_42253874原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。