tensorflow基本操作

  • Post author:
  • Post category:其他




一、图使用设置

import os
import tensorflow as tf

# allow_soft_placement=True: if no GPU exists, or an op has no GPU kernel,
# TensorFlow automatically falls back to running that op on the CPU.
tf_config = tf.ConfigProto(allow_soft_placement=True)

# Enumerate GPU devices in PCI_BUS_ID order, starting from 0.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# '0,1' exposes '/gpu:0' and '/gpu:1' in that order; '0' exposes only '/gpu:0';
# an empty string hides all GPUs (CPU only).
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# Fraction of the free GPU memory each process may claim.
tf_config.gpu_options.per_process_gpu_memory_fraction = 0.5

# # Alternative: grow GPU memory usage on demand instead of a fixed fraction.
# tf_config.gpu_options.allow_growth = True

# Bind the session to an explicit graph (optional when there is only one graph).
g = tf.Graph()
sess = tf.Session(config=tf_config, graph=g)

# Optional when there is only one graph.
with g.as_default():
    # Pin the ops to gpu:0 (optional).
    with tf.device('/gpu:0'):
        a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
        b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
        c = tf.matmul(a, b)

# Evaluate graph nodes. NOTE: sess.run([c]) returns a list, so `c` is rebound
# to a one-element list containing the result array.
c = sess.run([c], feed_dict={})
print(c)

二、tensorboard使用

import tensorflow as tf

# Launch TensorBoard with either of:
#   python -m tensorboard.main --logdir logs
#   tensorboard --logdir=logs
# then open http://localhost:6006/

# NOTE(review): network/get_loss are stubs that return None — substitute a real
# model, otherwise tf.summary.scalar and minimize() below will fail.
def network(X):
    return
def get_loss(logits,Y):
    return

X=tf.placeholder(tf.int32,[None,None])
Y=tf.placeholder(tf.int32,[None])
logits=network(X)
loss=get_loss(logits,Y)

# Scalar summary (drawn as a curve in TensorBoard).
tf.summary.scalar('loss_value', loss)
merged = tf.summary.merge_all()

with tf.variable_scope('train'):
    train_op = tf.train.RMSPropOptimizer(0.001).minimize(loss)# 0.001 is the learning rate

# init=tf.global_variables_initializer()
init = (tf.global_variables_initializer(), tf.local_variables_initializer())
with tf.Session() as sess:

    writer = tf.summary.FileWriter('logs/')
    writer.add_graph(sess.graph)

    sess.run(init)
    for i in range(3):
        x=y=[i,i]  # NOTE(review): toy feed values; X is declared 2-D — confirm shapes


        _, _loss,mer = sess.run([train_op, loss,merged], feed_dict={X: x, Y: y})
        writer.add_summary(mer,i)

三、tensorflow操作

# Placeholders: use None for dimensions whose size is unknown at graph-build time.
a = tf.placeholder(dtype=tf.int32, shape=[None, None], name="a")
b = tf.placeholder(dtype=tf.float32, shape=[None, None, None], name="b")

# Constant tensor.
c = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='c')

# tf.shape: the (dynamic) shape of a tensor, evaluated at run time.
a_shape = tf.shape(a)

# tf.reshape: change a tensor's shape ([2, 3] -> [3, 2] here).
a = tf.reshape(a, shape=[3, 2])

# tf.sign: element-wise sign -> 1 if x > 0, 0 if x == 0, -1 if x < 0.
a_sign = tf.sign(a)

# tf.abs: element-wise absolute value.
a_abs = tf.abs(a)

# tf.reduce_sum: sum over one or more axes.
c = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='c')
c_s = tf.reduce_sum(c, axis=1)                  # -> [6, 15]
c_s = tf.reduce_sum(c, axis=1, keep_dims=True)  # -> [[6], [15]]
# Reducing over [0, 1] equals two successive single-axis reductions.
c_s = tf.reduce_sum(c, axis=[0, 1])             # -> 21

# tf.reduce_mean: mean over an axis.
c_m = tf.reduce_mean(c, axis=-1)                # -> [2, 5]

# tf.cast: convert to another dtype.
c_c = tf.cast(c, tf.int32)

# tf.sequence_mask: build a mask matrix from sequence lengths.
mask = tf.sequence_mask(lengths=[1, 3], maxlen=2, dtype=tf.float32)  # -> [[1, 0], [1, 1]]

# tf.expand_dims: insert a size-1 dimension ([2, 3] -> [2, 3, 1] here).
c_e = tf.expand_dims(c, axis=-1)

# tf.concat: concatenate tensors along an axis ([2, 3] + [2, 3] -> [2, 6] here).
c_cat = tf.concat([c, c], axis=1)

# tf.pad: pad a tensor. mode is one of:
#   "CONSTANT"  - pad with zeros;
#   "REFLECT"   - mirror padding, excluding the edge values;
#   "SYMMETRIC" - mirror padding, including the edge values.
# paddings=[[0, 0], [1, 1]] pads nothing on axis 0 and one column on each side of axis 1.
# A fresh tensor is used for each example so the annotated results hold.
p = tf.constant([[1., 2.], [3., 4.]])
p_const = tf.pad(p, [[0, 0], [1, 1]], "CONSTANT")   # -> [[0, 1, 2, 0], [0, 3, 4, 0]]
p_refl = tf.pad(p, [[0, 0], [1, 1]], "REFLECT")     # -> [[2, 1, 2, 1], [4, 3, 4, 3]]
p_sym = tf.pad(p, [[0, 0], [1, 1]], "SYMMETRIC")    # -> [[1, 1, 2, 2], [3, 3, 4, 4]]

# tf.image.extract_patches: cut fixed-size patches out of an image batch.
#   sizes:   the patch size (how many pixels each patch covers);
#   strides: the step between the starts of consecutive patches;
#   rates:   dilation — how many source pixels to skip between sampled pixels;
#   padding: 'VALID' (patches must lie fully inside the image) or
#            'SAME'  (partial patches allowed; missing pixels are zero-filled).
img = tf.constant(list(range(1, 26)), shape=[1, 5, 5, 1], name='img')
patches = tf.image.extract_patches(img, sizes=[1, 1, 2, 1], strides=[1, 1, 1, 1], rates=[1, 1, 1, 1], padding='SAME')

# tf.zeros_like / tf.ones_like: all-zero / all-one tensors shaped like the argument.
zeros = tf.zeros_like(b)
ons = tf.ones_like(b)

# tf.eye: [n, n] identity matrix (the original used an undefined n; define it first).
n = 3
onehot = tf.eye(n)

# tf.where: element-wise select — first value where the condition holds, else the second.
# Clamp negative entries of a to 0. The zeros tensor must match a's shape and
# dtype, so build it from a (the original reused `zeros`, shaped like b).
a = tf.where(a < 0, tf.zeros_like(a), a)

# tf.squeeze: drop size-1 dimensions.
# With a of shape [1, 2, 3, 1, 1]:
#   tf.squeeze(a)               -> shape [2, 3]
#   tf.squeeze(a, axis=-1)      -> shape [1, 2, 3, 1]
#   tf.squeeze(a, axis=[0, 3])  -> shape [2, 3, 1]

# tf.transpose: permute dimensions.
# With a of shape [1, 2, 3, 4]:
#   tf.transpose(a, perm=[0, 1, 3, 2]) -> shape [1, 2, 4, 3]

# tf.tile: repeat a tensor along each dimension. The keyword is `multiples`
# (the original wrote `perm`, which belongs to tf.transpose and would raise).
# With a of shape [2, 3]:
#   tf.tile(a, multiples=[4, 2]) -> shape [8, 6]  (axis 0 repeated 4x, axis 1 2x)



三、全连接层

# Fully connected layer: output = input · W + B, followed by dropout.
# NOTE(review): in_size, output_size, input and dropout must be defined by the
# caller; `input` also shadows the Python builtin of the same name.
W = tf.get_variable(name='W', shape=[in_size, output_size], dtype=tf.float32,initializer=tf.truncated_normal_initializer())
B = tf.get_variable(name='B', shape=[output_size], dtype=tf.float32, initializer=tf.zeros_initializer())
# input.shape = [-1, in_size]
output = tf.nn.xw_plus_b(input, W, B)
# TF1-style dropout: the second argument is the KEEP probability, hence 1 - dropout.
output = tf.nn.dropout(output, 1 - dropout)



四、lstm/gru层

# Build one forward and one backward RNN cell, then run a bidirectional RNN.
# (The original mixed tabs and spaces inside the loop body — a TabError in
# Python 3; indentation is normalized to spaces here.)
cell = {}
for name in ['forward', 'backward']:
    # GRU alternative:
    # cell[name] = tf.contrib.rnn.GRUCell(num_units=gru_dim)
    # LSTM cell with lstm_dim hidden units:
    cell[name] = tf.contrib.rnn.BasicLSTMCell(num_units=lstm_dim)

outputs, final_states = tf.nn.bidirectional_dynamic_rnn(
    cell['forward'],
    cell['backward'],
    inputs,                 # must be 3-D: [batch, max_length, dim]
    dtype=tf.float32,
    sequence_length=length  # true (unpadded) length of each sequence
)
# outputs: tuple (forward_output, backward_output), e.g. each [10, 6, 300]
# final_states: tuple of the final (c, h) for the forward and backward cells,
#               e.g. c: [10, 300], h: [10, 300]
# cfw, hfw = final_states[0]                      # forward cell's c and h
# cbw, hbw = final_states[1]                      # backward cell's c and h
# output = tf.concat([cfw, hfw, cbw, hbw], -1)    # concat the four states, e.g. [10, 1200]



五、cnn层

# Convolution layer: 3x3 kernel mapping in_channel -> out_channel.
filter_size = [3, 3, in_channel, out_channel]
filter_w = tf.Variable(tf.truncated_normal(filter_size, mean=0.0, stddev=0.1), name="filter_w")
# One bias per OUTPUT CHANNEL (the original used an undefined `bacth_size` here,
# which is wrong — tf.nn.bias_add needs the bias to match the channel axis).
filter_b = tf.Variable(tf.zeros([out_channel]), name="filter_b")

# Example: input = [1, 3, 3, 5]  -> one 3x3 image with 5 channels
#          filter = [2, 2, 5, 1] -> 2x2 kernel, 5 input channels, 1 output channel
# strides = [1, 1, 1, 1]: first/last entries are 1 (no striding over batch or
#   channels); the middle two are the vertical and horizontal step sizes.
# padding must be the UPPERCASE string "SAME" (zero-pad the borders) or "VALID"
#   (no padding) — the original lowercase "same" raises a ValueError.
l1 = tf.nn.conv2d(input=input_data, filter=filter_w, strides=[1, 1, 1, 1], padding="SAME", name="conv")
# Add the per-channel bias, then the activation.
l1 = tf.nn.relu(tf.nn.bias_add(l1, filter_b), name="activations")
# Pooling (tf.nn.max_pool or tf.nn.avg_pool).
# ksize is usually [1, height, width, 1], matching the input layout.
max_pool = tf.nn.max_pool(value=l1, ksize=[1, 3, 3, 1], strides=[1, 1, 1, 1], padding="VALID", name="max_pool")
# TF1-style dropout: the second argument is the KEEP probability.
max_pool = tf.nn.dropout(max_pool, 1 - dropout)



六、attention层

 def attention(inputs, attention_size, time_major=False, return_alphas=False):
    if isinstance(inputs, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        inputs = tf.concat(inputs, 2)

    if time_major:
        # (T,B,D) => (B,T,D)
        inputs = tf.array_ops.transpose(inputs, [1, 0, 2])

    hidden_size = inputs.shape[2].value  # D value - hidden size of the RNN layer

    # Trainable parameters
    w_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1))
    b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
    u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))

    with tf.name_scope('v'):
        # Applying fully connected layer with non-linear activation to each of the B*T timestamps;
        #  the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
        v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)

    # For each of the timestamps its vector of size A from `v` is reduced with `u` vector
    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B,T) shape
    alphas = tf.nn.softmax(vu, name='alphas')  # (B,T) shape

    # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)

    if not return_alphas:
        return output
    else:
        return output, alphas



七、transformer层

class model():
    """Transformer-encoder text classifier (TensorFlow 1.x graph-building style).

    NOTE(review): relies on `tf`, `np`, self.config, self.char_to_id and
    self.class_num being provided elsewhere in the file/project.
    """

    def network(self):
        """Build the graph: inputs, embeddings, transformer blocks, output layer, loss."""
        # Model inputs.
        self.inputX = tf.placeholder(tf.int32, [None, self.config['STEN_LEN']], name="inputX")
        self.inputY = tf.placeholder(tf.int32, [None], name="inputY")

        self.dropoutKeepProb = tf.placeholder(tf.float32, name="dropoutKeepProb")
        self.embeddedPosition = tf.placeholder(tf.float32, [None, self.config['STEN_LEN'], self.config['STEN_LEN']],name="embeddedPosition")

        # Accumulator for the L2 regularization term.
        l2Loss = tf.constant(0.0)

        # Word-embedding layer. Position vectors can be defined two ways: (1) a fixed
        # one-hot encoding fed in and concatenated with the word vectors, which worked
        # better on this data set; or (2) the sinusoidal scheme from the paper, which
        # performed worse here — possibly extra model complexity hurting on small data.

        with tf.name_scope("embedding"):
            # Embedding matrix (could instead be initialized from pre-trained vectors).
            self.W = tf.get_variable(name='W', shape=[len(self.char_to_id), self.config['CHAR_DIM']],
                                     initializer=tf.truncated_normal_initializer())
            # self.W = tf.Variable(tf.cast(wordEmbedding, dtype=tf.float32, name="word2vec"), name="W")
            # Map token ids to vectors: [batch_size, sequence_length, embedding_size].
            self.embedded = tf.nn.embedding_lookup(self.W, self.inputX)
            self.embeddedWords = tf.concat([self.embedded, self.embeddedPosition], -1)

        with tf.name_scope("transformer"):
            for i in range(self.config['NUM_BLOCKS']):
                with tf.name_scope("transformer-{}".format(i + 1)):
                    # [batch_size, sequence_length, embedding_size]
                    multiHeadAtt = self._multiheadAttention(rawKeys=self.inputX, queries=self.embeddedWords,
                                                            keys=self.embeddedWords)
                    # [batch_size, sequence_length, embedding_size]
                    self.embeddedWords = self._feedForward(multiHeadAtt,[self.config['FILTERS'], self.config['CHAR_DIM'] + self.config['STEN_LEN']])

            # Flatten to [batch_size, STEN_LEN * (CHAR_DIM + STEN_LEN)].
            outputs = tf.reshape(self.embeddedWords,[-1, self.config['STEN_LEN'] * (self.config['CHAR_DIM'] + self.config['STEN_LEN'])])

        outputSize = outputs.get_shape()[-1].value

        with tf.name_scope("dropout"):
            outputs = tf.nn.dropout(outputs, keep_prob=self.dropoutKeepProb)

        # Fully connected output layer.
        with tf.name_scope("output"):
            outputW = tf.get_variable("outputW", shape=[outputSize, self.class_num],initializer=tf.contrib.layers.xavier_initializer())

            outputB = tf.Variable(tf.constant(0.1, shape=[self.class_num]), name="outputB")
            l2Loss += tf.nn.l2_loss(outputW)
            l2Loss += tf.nn.l2_loss(outputB)
            self.logits = tf.nn.xw_plus_b(outputs, outputW, outputB, name="logits")
            self.predictions = tf.cast(tf.argmax(self.logits, -1), tf.int32)
            self.acc = tf.reduce_mean(tf.cast(tf.equal(self.predictions, self.inputY), tf.float32), name="acc")

        # Softmax cross-entropy loss plus weighted L2 regularization.
        # (The original comment said "binary cross-entropy", but this op is multi-class.)
        with tf.name_scope("loss"):
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.inputY)
            self.loss = tf.reduce_mean(losses) + self.config['L2'] * l2Loss

    def _layerNormalization(self, inputs, scope="layerNorm"):
        """Layer normalization over the last (feature) axis.

        Unlike batch norm, the statistics are computed per sample over the last
        dimension only. NOTE(review): `scope` is accepted but never used.
        """

        inputsShape = inputs.get_shape()  # [batch_size, sequence_length, embedding_size]

        paramsShape = inputsShape[-1:]

        # Mean/variance over the last axis only (batch norm would use all samples);
        # with keep_dims both have shape [batch_size, sequence_len, 1].
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)

        beta = tf.Variable(tf.zeros(paramsShape))

        gamma = tf.Variable(tf.ones(paramsShape))
        # Normalize; the 1e-8 guards against division by zero.
        normalized = (inputs - mean) / ((variance + 1e-8) ** .5)

        outputs = gamma * normalized + beta

        return outputs

    def _multiheadAttention(self, rawKeys, queries, keys, numUnits=None, causality=False, scope="multiheadAttention"):
        """Multi-head scaled dot-product attention with key masking.

        rawKeys are the raw token ids, used only to build the padding mask —
        the keys already include position embeddings, so padded positions are
        no longer exact zeros. NOTE(review): `scope` is accepted but never used.
        """

        if numUnits is None:  # Default to the input's last dimension, i.e. the embedding size.
            numUnits = queries.get_shape().as_list()[-1]

        # tf.layers.dense applies a (non-)linear map to the last axis of an N-D tensor.
        # This is the per-head weight projection from the paper, done before splitting
        # into heads instead of after — equivalent in principle.
        # Q, K, V: [batch_size, sequence_length, embedding_size]
        Q = tf.layers.dense(queries, numUnits, activation=tf.nn.relu)
        K = tf.layers.dense(keys, numUnits, activation=tf.nn.relu)
        V = tf.layers.dense(keys, numUnits, activation=tf.nn.relu)

        # Split the last axis into NUM_HEADS pieces and stack them on the batch axis.
        # Q_, K_, V_: [batch_size * numHeads, sequence_length, embedding_size / numHeads]
        Q_ = tf.concat(tf.split(Q, self.config['NUM_HEADS'], axis=-1), axis=0)
        K_ = tf.concat(tf.split(K, self.config['NUM_HEADS'], axis=-1), axis=0)
        V_ = tf.concat(tf.split(V, self.config['NUM_HEADS'], axis=-1), axis=0)

        # Dot product of queries and keys: [batch_size * numHeads, queries_len, key_len].
        similary = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))

        # Scale by the square root of the key dimension.
        scaledSimilary = similary / (K_.get_shape().as_list()[-1] ** 0.5)

        # Padding tokens should contribute nothing to the result. With plain zero
        # embeddings their weights would already come out zero, but adding position
        # vectors makes them non-zero, so they must be masked explicitly. Queries
        # contain padding too, but since queries == keys in self-attention, masking
        # one side suffices — if either factor is 0 the weight is 0.
        # See https://github.com/Kyubyong/transformer/issues/3 for details.

        # Tile the raw ids per head: [batch_size * numHeads, keys_len].
        keyMasks = tf.tile(rawKeys, [self.config['NUM_HEADS'], 1])

        # Add a query axis and tile: [batch_size * numHeads, queries_len, keys_len].
        keyMasks = tf.tile(tf.expand_dims(keyMasks, 1), [1, tf.shape(queries)[1], 1])

        # A tensor shaped like scaledSimilary holding a huge negative value (-2**33),
        # so the masked positions get ~0 weight after the softmax below.
        paddings = tf.ones_like(scaledSimilary) * (-2 ** (32 + 1))

        # tf.where(condition, x, y): True picks from x, False from y (same shapes).
        # Wherever the raw key id equals 0 (padding), substitute the huge negative value.
        maskedSimilary = tf.where(tf.equal(keyMasks, 0), paddings,scaledSimilary)  # [batch_size * numHeads, queries_len, key_len]

        # Causal masking: attend only to earlier positions. Needed in a Transformer
        # decoder (generation); a classifier only needs the encoder, so this is off
        # by default.
        if causality:
            diagVals = tf.ones_like(maskedSimilary[0, :, :])  # [queries_len, keys_len]
            tril = tf.contrib.linalg.LinearOperatorTriL(diagVals).to_dense()  # [queries_len, keys_len]
            masks = tf.tile(tf.expand_dims(tril, 0),[tf.shape(maskedSimilary)[0], 1, 1])  # [batch_size * numHeads, queries_len, keys_len]

            paddings = tf.ones_like(masks) * (-2 ** (32 + 1))
            maskedSimilary = tf.where(tf.equal(masks, 0), paddings,maskedSimilary)  # [batch_size * numHeads, queries_len, keys_len]

        # Softmax over the key axis: [batch_size * numHeads, queries_len, keys_len].
        weights = tf.nn.softmax(maskedSimilary)

        # Weighted sum: [batch_size * numHeads, sequence_length, embedding_size / numHeads].
        outputs = tf.matmul(weights, V_)

        # Re-assemble the heads back to [batch_size, sequence_length, embedding_size].
        outputs = tf.concat(tf.split(outputs, self.config['NUM_HEADS'], axis=0), axis=2)

        outputs = tf.nn.dropout(outputs, keep_prob=self.config['MULTI_KEEP_PROB'])

        # Residual connection around the sub-layer: H(x) = F(x) + x.
        outputs += queries
        # Layer normalization.
        outputs = self._layerNormalization(outputs)
        return outputs

    def _feedForward(self, inputs, filters, scope="multiheadAttention"):
        """Position-wise feed-forward sub-layer, implemented with 1-D convolutions.

        filters: [inner_size, outer_size]; the outer size must match the input's
        last dimension so the residual addition is valid.
        NOTE(review): `scope` is accepted but never used, and its default looks
        copy-pasted from _multiheadAttention.
        """

        # Inner layer.
        # filters: the number of convolution kernels.
        # kernel_size: only the height is given — the kernel width always spans the
        # whole embedding, so the convolution slides along the token axis only.
        params = {"inputs": inputs, "filters": filters[0], "kernel_size": 1,
                  "activation": tf.nn.relu, "use_bias": True}
        outputs = tf.layers.conv1d(**params)

        # Outer layer (no activation).
        params = {"inputs": outputs, "filters": filters[1], "kernel_size": 1,
                  "activation": None, "use_bias": True}

        # Again a "1-D" convolution; the kernel is really 2-D with its width fixed
        # to the embedding size.
        # Result: [batch_size, sequence_length, embedding_size]
        outputs = tf.layers.conv1d(**params)

        # Residual connection.
        outputs += inputs

        # Layer normalization.
        outputs = self._layerNormalization(outputs)

        return outputs

    def _positionEmbedding(self, scope="positionEmbedding"):
        """Sinusoidal position embeddings, tiled across the batch.

        NOTE(review): `scope` is accepted but never used; relies on numpy (np)
        being imported elsewhere.
        """

        # Position indices, tiled over every sample in the batch.
        positionIndex = tf.tile(tf.expand_dims(tf.range(self.config['STEN_LEN']), 0), [self.config['BATCH_SIZE'], 1])

        # First part of the embedding: pos / 10000^((i - i % 2) / CHAR_DIM).
        positionEmbedding = np.array([[pos / np.power(10000, (i - i % 2) / self.config['CHAR_DIM']) for i in range(self.config['CHAR_DIM'])]
                                      for pos in range(self.config['STEN_LEN'])])

        # Wrap even indices with sin and odd indices with cos.
        positionEmbedding[:, 0::2] = np.sin(positionEmbedding[:, 0::2])
        positionEmbedding[:, 1::2] = np.cos(positionEmbedding[:, 1::2])

        # Convert to a float32 tensor.
        positionEmbedding_ = tf.cast(positionEmbedding, dtype=tf.float32)

        # Look up per-position vectors: [batchSize, sequenceLen, embeddingSize].
        positionEmbedded = tf.nn.embedding_lookup(positionEmbedding_, positionIndex)

        return positionEmbedded


    # Fixed one-hot position encoding.
    def fixedPositionEmbedding(self,batch_size):
        """Return a [batch_size, STEN_LEN, STEN_LEN] float32 array whose
        [b, step] entry is the one-hot vector for position `step`."""
        embeddedPosition = []
        for batch in range(batch_size):
            x = []
            for step in range(self.config['STEN_LEN']):
                a = np.zeros(self.config['STEN_LEN'])
                a[step] = 1
                x.append(a)
            embeddedPosition.append(x)

        return np.array(embeddedPosition, dtype="float32")



八、embedding 层

# Character-embedding layer: a trainable lookup table of shape [num_chars, char_dim].
# NOTE(review): num_chars, char_dim, self.initializer and char_inputs come from
# the surrounding model code.
char_lookup = tf.get_variable(
                    name="char_embedding",
                    shape=[num_chars,char_dim],
                    initializer=self.initializer)
# Presumably [batch, seq_len, char_dim] if char_inputs is [batch, seq_len] — confirm.
embedding = tf.nn.embedding_lookup(char_lookup, char_inputs)



九、损失

# One-hot targets via an identity matrix: row i of tf.eye is the one-hot vector for tag i.
onehot = tf.eye(self.num_tags)
# [batch, seq_len, num_tags]: one-hot mapping of the integer targets.
targets = tf.nn.embedding_lookup(onehot, self.targets)

# # Cross-entropy loss alternative:
# loss_T = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=targets))

# MAE (mean absolute error): sum(|y_target - y_pred|) / num
loss_M = tf.reduce_mean(tf.losses.absolute_difference(logits,targets))



十、优化器

# --- Optimizer choices ---
# NOTE: each `opt = ...` below overwrites the previous one; only the LAST
# assignment (Adam) feeds the training op at the bottom. The original wrapped
# these notes in Chinese bare-string literals; they are plain comments here.

# tf.train.AdagradOptimizer(learning_rate, initial_accumulator_value=0.1,
#                           use_locking=False, name='Adagrad')
# Adagrad adapts the learning rate automatically, so no manual tuning is needed;
# the drawback is that the effective rate shrinks monotonically with iterations
# and eventually approaches 0.
opt = tf.train.AdagradOptimizer(lr)


# tf.train.MomentumOptimizer(learning_rate, momentum, use_locking=False,
#                            name='Momentum', use_nesterov=False)
# SGD with momentum: accumulates a running history of gradients to accelerate
# plain SGD. It converges quickly and escapes poor local minima more easily,
# but can overshoot and land slightly off the optimum.
# With use_nesterov=True it becomes Nesterov accelerated gradient (NAG),
# a variant of the momentum method.
opt = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9, use_nesterov=True)


# tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0,
#                           epsilon=1e-10, use_locking=False, name='RMSProp')
# RMSProp replaces Adagrad's gradient accumulation with an exponentially
# weighted moving average, which behaves better in non-convex settings and is
# one of the optimizers deep-learning practitioners commonly reach for.
opt = tf.train.RMSPropOptimizer(lr)


# tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.9, beta2=0.999,
#                        epsilon=1e-08, use_locking=False, name='Adam')
# Adam folds momentum into the (exponentially weighted) first-moment estimate
# and — unlike RMSProp, whose second-moment estimate can be strongly biased
# early in training — applies bias correction to both the first and second
# moments. It is generally robust to hyper-parameter choice (the learning rate
# sometimes needs adjusting from the default) and works very well in practice.
opt = tf.train.AdamOptimizer(lr)


# --- Training op ---
# # Compute gradients only (e.g. for inspection):
# grad = opt.compute_gradients(self.loss, [self.embedding])

# One-step update without clipping:
# train_op = opt.minimize(self.loss)

# Gradient clipping: compute the (gradient, variable) pairs for `loss` (a scalar tensor)...
grads_vars = opt.compute_gradients(loss)
# ...clip every gradient into [-clip, clip]...
clip = 5
capped_grads_vars = [[tf.clip_by_value(g, -clip, clip), v] for g, v in grads_vars]
# ...and apply the clipped gradients.
train_op = opt.apply_gradients(capped_grads_vars)



十一、模型存储

# Keep at most the 3 most recent checkpoints.
saver = tf.train.Saver(max_to_keep=3)
# Save; the epoch number is appended to the checkpoint filename.
saver.save(sess, path, global_step=epoch)

# Restore
# e.g. path = 'D:/ckpt/model.ckpt-9'
saver.restore(sess, path)



版权声明:本文为qq_43655307原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。