1、单组列表数据集的存储
- 数据列表:seq
- 创建单个特征:tf.train.Feature(列表)
- 创建特征列表:tf.train.FeatureList
- 嵌入序列实例:example = tf.train.SequenceExample
- 保存:seq_writer = tf.io.TFRecordWriter("filename");seq_writer.write(example.SerializeToString())
import tensorflow as tf

# Create a single-sequence dataset.
seq = [1,2,3,4,5,6,7,8,9,0]
# One int64 Feature holding the entire list.
features_seq = tf.train.Feature(int64_list=tf.train.Int64List(value=seq))
# FeatureLists takes a dict mapping a key to a FeatureList
# (here the FeatureList holds a single Feature).
feature_lists = tf.train.FeatureLists(
    feature_list={'features_seq': tf.train.FeatureList(feature=[features_seq])}
)
# Nest the feature lists into a SequenceExample.
example = tf.train.SequenceExample(feature_lists=feature_lists)
# Fix: the original never closed the writer, so the record could stay
# unflushed; the context manager guarantees flush + close.
with tf.io.TFRecordWriter("seq.tfrecord") as seq_writer:
    seq_writer.write(example.SerializeToString())
注意参数:
Feature的参数是list列表,FeatureList的参数feature_list为字典
2、包含列表的列表数据集的存储
- 使用for循环获取所含列表进行存储
import tensorflow as tf

# A list of lists: each inner list becomes its own SequenceExample record.
seq_list = [[1,2,3,4,5,6,7,8,9,0],[1,2,3]]
# Fix: the original never closed the writer; the context manager
# guarantees the records are flushed and the file is closed.
with tf.io.TFRecordWriter("seq.tfrecord") as seq_writer:
    for seq in seq_list:
        # One int64 Feature holding the whole inner list.
        features_seq = tf.train.Feature(int64_list=tf.train.Int64List(value=seq))
        # feature_list is a dict: key -> FeatureList.
        feature_lists = tf.train.FeatureLists(
            feature_list={'features_seq': tf.train.FeatureList(feature=[features_seq])}
        )
        # Nest into a SequenceExample and append it to the file.
        example = tf.train.SequenceExample(feature_lists=feature_lists)
        seq_writer.write(example.SerializeToString())
注意参数:
Feature的参数是list列表,FeatureList的参数feature_list为字典,循环嵌入example并写入
3、并列列表数据集的存储
并列与单组列表没什么区别,逐一创建特征Feature并在特征列表FeatureList中进行整合即可
import tensorflow as tf

# Two parallel sequences stored in a single SequenceExample.
seq_1 = [1,2,3,4,5,6,7,8,9,0]
seq_2 = [0,9,8,7,6,5,4,3,2,1]
# One int64 Feature per sequence.
features_seq_1 = tf.train.Feature(int64_list=tf.train.Int64List(value=seq_1))
features_seq_2 = tf.train.Feature(int64_list=tf.train.Int64List(value=seq_2))
# A single FeatureList can hold several Features side by side.
feature_lists = tf.train.FeatureLists(feature_list={
    'features_seq': tf.train.FeatureList(feature=[features_seq_1, features_seq_2])
})
example = tf.train.SequenceExample(feature_lists=feature_lists)
# Fix: close the writer via a context manager so the record is actually
# flushed (the original left the writer open).
with tf.io.TFRecordWriter("seq.tfrecord") as seq_writer:
    seq_writer.write(example.SerializeToString())
注意参数:
Feature的参数是list列表,FeatureList的参数feature_list为字典,字典内部包含数组[features_seq_1,features_seq_2]
4、列表数据集写入与读取
待写入数据如下:
seq_list = [[1, 2, 3], [1, 2], [1, 2, 3], [1, 2], [1, 2, 3], [1, 2], [1, 2, 3], [1, 2]]
首先构建写入函数,注意这里是 map 逐个元素作 Feature,然后构成 list(Feature) -> encoder_smiles_input_feature
def generate_tfrecords(tfrecod_filename, seq_list):
    """Write each inner list of seq_list to tfrecod_filename as one SequenceExample.

    Every element of an inner list becomes its own one-value int64 Feature,
    so the stored FeatureList has one Feature per element.
    """
    with tf.io.TFRecordWriter(tfrecod_filename) as writer:
        for seq in seq_list:
            # One Feature per element (not one Feature for the whole list).
            element_features = [
                tf.train.Feature(int64_list=tf.train.Int64List(value=[item]))
                for item in seq
            ]
            lists = tf.train.FeatureLists(
                feature_list={'seq_feature': tf.train.FeatureList(feature=element_features)}
            )
            writer.write(
                tf.train.SequenceExample(feature_lists=lists).SerializeToString()
            )
构建读取函数,主要为 tf.io.parse_single_sequence_example 和参数 sequence_features。对于 tf.io.FixedLenSequenceFeature([], dtype=tf.int64),可用不定长度的 tf.io.VarLenFeature(tf.int64) 替换,此时对 shape 没有要求,不过输出为稀疏张量 SparseTensor。否则当 shape 不对时,会报错:values size: 10 but output shape: [],当然可以给参数 10 来解决,但当成员 shape 不统一时只能用不定长的了。
def single_example_parser(serialized_example):
    """Parse one serialized SequenceExample and return its 'seq_feature' tensor."""
    # FixedLenSequenceFeature([], tf.int64): one scalar int64 per Feature in
    # the FeatureList; the sequence length comes from the record itself.
    _, parsed = tf.io.parse_single_sequence_example(
        serialized=serialized_example,
        sequence_features={"seq_feature": tf.io.FixedLenSequenceFeature([], dtype=tf.int64)},
    )
    return parsed["seq_feature"]
完整代码如下
import tensorflow as tf
seq_list = [[1, 2, 3], [1, 2], [1, 2, 3], [1, 2], [1, 2, 3], [1, 2], [1, 2, 3], [1, 2]]
def generate_tfrecords(tfrecod_filename, seq_list):
    """Serialize every inner list of seq_list into tfrecod_filename.

    Each inner list becomes its own SequenceExample whose 'seq_feature'
    FeatureList holds one scalar int64 Feature per element.
    """
    with tf.io.TFRecordWriter(tfrecod_filename) as out_file:
        for sequence in seq_list:
            per_element = []
            for value in sequence:
                per_element.append(
                    tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
                )
            wrapped = tf.train.FeatureLists(
                feature_list={'seq_feature': tf.train.FeatureList(feature=per_element)}
            )
            record = tf.train.SequenceExample(feature_lists=wrapped)
            out_file.write(record.SerializeToString())
# (Removed a commented-out parser draft: it used a nonexistent
#  tf.data.FixedLengthRecordDataset(..., record_bytes=...) signature and was
#  misleading; the working parser is defined below.)
tfrecord_filename = './seq_dataset.tfrecord'  # where the TFRecord data is stored
generate_tfrecords(tfrecord_filename, seq_list)  # write the TFRecord data
def single_example_parser(serialized_example):
    """Decode one SequenceExample record into its 'seq_feature' int64 tensor."""
    # Each stored Feature is a single int64 scalar, so an empty per-step
    # shape ([]) lets the parser recover the full variable-length sequence.
    sequence_features = {"seq_feature": tf.io.FixedLenSequenceFeature([], dtype=tf.int64)}
    context, sequences = tf.io.parse_single_sequence_example(
        serialized=serialized_example, sequence_features=sequence_features
    )
    return sequences["seq_feature"]
# Build the input pipeline: list the file(s), read raw records, parse each one.
file_path_list = tf.data.Dataset.list_files(["./seq_dataset.tfrecord"])
dataset = tf.data.TFRecordDataset(file_path_list).map(single_example_parser)
for line in dataset:
    print(line)
打印如下:以 tensor 格式输出
tf.Tensor([1 2 3], shape=(3,), dtype=int64)
tf.Tensor([1 2], shape=(2,), dtype=int64)
tf.Tensor([1 2 3], shape=(3,), dtype=int64)
tf.Tensor([1 2], shape=(2,), dtype=int64)
tf.Tensor([1 2 3], shape=(3,), dtype=int64)
tf.Tensor([1 2], shape=(2,), dtype=int64)
tf.Tensor([1 2 3], shape=(3,), dtype=int64)
tf.Tensor([1 2], shape=(2,), dtype=int64)
当我们把 sequence_feature 换成不定长时,打印如下:以稀疏张量 SparseTensor 格式输出
SparseTensor(indices=tf.Tensor(
[[0 0]
[1 0]
[2 0]], shape=(3, 2), dtype=int64), values=tf.Tensor([1 2 3], shape=(3,), dtype=int64), dense_shape=tf.Tensor([3 1], shape=(2,), dtype=int64))
SparseTensor(indices=tf.Tensor(
[[0 0]
[1 0]], shape=(2, 2), dtype=int64), values=tf.Tensor([1 2], shape=(2,), dtype=int64), dense_shape=tf.Tensor([2 1], shape=(2,), dtype=int64))
SparseTensor(indices=tf.Tensor(
[[0 0]
[1 0]
[2 0]], shape=(3, 2), dtype=int64), values=tf.Tensor([1 2 3], shape=(3,), dtype=int64), dense_shape=tf.Tensor([3 1], shape=(2,), dtype=int64))
SparseTensor(indices=tf.Tensor(
[[0 0]
[1 0]], shape=(2, 2), dtype=int64), values=tf.Tensor([1 2], shape=(2,), dtype=int64), dense_shape=tf.Tensor([2 1], shape=(2,), dtype=int64))
SparseTensor(indices=tf.Tensor(
[[0 0]
[1 0]
[2 0]], shape=(3, 2), dtype=int64), values=tf.Tensor([1 2 3], shape=(3,), dtype=int64), dense_shape=tf.Tensor([3 1], shape=(2,), dtype=int64))
SparseTensor(indices=tf.Tensor(
[[0 0]
[1 0]], shape=(2, 2), dtype=int64), values=tf.Tensor([1 2], shape=(2,), dtype=int64), dense_shape=tf.Tensor([2 1], shape=(2,), dtype=int64))
SparseTensor(indices=tf.Tensor(
[[0 0]
[1 0]
[2 0]], shape=(3, 2), dtype=int64), values=tf.Tensor([1 2 3], shape=(3,), dtype=int64), dense_shape=tf.Tensor([3 1], shape=(2,), dtype=int64))
SparseTensor(indices=tf.Tensor(
[[0 0]
[1 0]], shape=(2, 2), dtype=int64), values=tf.Tensor([1 2], shape=(2,), dtype=int64), dense_shape=tf.Tensor([2 1], shape=(2,), dtype=int64))
对1、2、3的输出打印
我们都用以下代码进行打印,看一看
def single_example_parser(serialized_example):
    """Parse 'features_seq' from a SequenceExample, returning a SparseTensor.

    VarLenFeature places no constraint on the per-step shape, so sequences
    of any length parse cleanly — at the cost of a sparse (not dense) result.
    """
    _, parsed = tf.io.parse_single_sequence_example(
        serialized=serialized_example,
        sequence_features={'features_seq': tf.io.VarLenFeature(tf.int64)},
    )
    return parsed['features_seq']
# Read back seq.tfrecord and print every parsed record.
file_path_list = tf.data.Dataset.list_files(["./seq.tfrecord"])
dataset = tf.data.TFRecordDataset(file_path_list).map(single_example_parser)
for line in dataset:
    print(line)
第一个为:
SparseTensor(indices=tf.Tensor(
[[0 0]
[0 1]
[0 2]
[0 3]
[0 4]
[0 5]
[0 6]
[0 7]
[0 8]
[0 9]], shape=(10, 2), dtype=int64), values=tf.Tensor([1 2 3 4 5 6 7 8 9 0], shape=(10,), dtype=int64), dense_shape=tf.Tensor([ 1 10], shape=(2,), dtype=int64))
第二个为:
SparseTensor(indices=tf.Tensor(
[[0 0]
[0 1]
[0 2]
[0 3]
[0 4]
[0 5]
[0 6]
[0 7]
[0 8]
[0 9]], shape=(10, 2), dtype=int64), values=tf.Tensor([1 2 3 4 5 6 7 8 9 0], shape=(10,), dtype=int64), dense_shape=tf.Tensor([ 1 10], shape=(2,), dtype=int64))
SparseTensor(indices=tf.Tensor(
[[0 0]
[0 1]
[0 2]], shape=(3, 2), dtype=int64), values=tf.Tensor([1 2 3], shape=(3,), dtype=int64), dense_shape=tf.Tensor([1 3], shape=(2,), dtype=int64))
第三个为:
SparseTensor(indices=tf.Tensor(
[[0 0]
[0 1]
[0 2]
[0 3]
[0 4]
[0 5]
[0 6]
[0 7]
[0 8]
[0 9]
[1 0]
[1 1]
[1 2]
[1 3]
[1 4]
[1 5]
[1 6]
[1 7]
[1 8]
[1 9]], shape=(20, 2), dtype=int64), values=tf.Tensor([1 2 3 4 5 6 7 8 9 0 0 9 8 7 6 5 4 3 2 1], shape=(20,), dtype=int64), dense_shape=tf.Tensor([ 2 10], shape=(2,), dtype=int64))
也可以用4的方法,通过 list(map(lambda input: tf.train.Feature(int64_list=tf.train.Int64List(value=[input])), seq_1)) 逐个输入到 feature_list 中,逐次写入
import tensorflow as tf

# Two parallel sequences, each written as its own SequenceExample record.
seq_1 = [1,2,3,4,5,6,7,8,9,0]
seq_2 = [0,9,8,7,6,5,4,3,2,1]
# One scalar int64 Feature per element, so the FeatureList length equals the
# sequence length — exactly what FixedLenSequenceFeature([]) expects on read.
features_seq_1 = [tf.train.Feature(int64_list=tf.train.Int64List(value=[v])) for v in seq_1]
features_seq_2 = [tf.train.Feature(int64_list=tf.train.Int64List(value=[v])) for v in seq_2]
# Each sequence gets its own FeatureLists/SequenceExample.
feature_lists_1 = tf.train.FeatureLists(feature_list={
    'features_seq': tf.train.FeatureList(feature=features_seq_1)
})
feature_lists_2 = tf.train.FeatureLists(feature_list={
    'features_seq': tf.train.FeatureList(feature=features_seq_2)
})
example_1 = tf.train.SequenceExample(feature_lists=feature_lists_1)
example_2 = tf.train.SequenceExample(feature_lists=feature_lists_2)
# Fix: the original never closed the writer, so the records might not be
# flushed before the reading code below opens the same file. The context
# manager guarantees flush + close before the read happens.
with tf.io.TFRecordWriter("seq.tfrecord") as seq_writer:
    seq_writer.write(example_1.SerializeToString())
    seq_writer.write(example_2.SerializeToString())
def single_example_parser(serialized_example):
    """Return the dense int64 'features_seq' tensor from one SequenceExample."""
    # Scalar-per-step features: shape [] with the length taken from the record.
    features_spec = {'features_seq': tf.io.FixedLenSequenceFeature([], dtype=tf.int64)}
    _, sequence_parsed = tf.io.parse_single_sequence_example(
        serialized=serialized_example, sequence_features=features_spec
    )
    return sequence_parsed['features_seq']
# Read the file back and print each parsed sequence as a dense tensor.
file_path_list = tf.data.Dataset.list_files(["./seq.tfrecord"])
dataset = tf.data.TFRecordDataset(file_path_list).map(single_example_parser)
for line in dataset:
    print(line)
此时输出为张量Tensor格式:
tf.Tensor([1 2 3 4 5 6 7 8 9 0], shape=(10,), dtype=int64)
tf.Tensor([0 9 8 7 6 5 4 3 2 1], shape=(10,), dtype=int64)
最后可以搭配shuffle()、map()、repeat()、batch()批量输出
dataset = tf.data.TFRecordDataset(tfrecord_filename).map(single_example_parser).shuffle(batch_size*10).batch(batch_size).repeat(2)