# Data handling
import pickle
import numpy as np
import pandas as pd
import re
# Preprocessing utilities
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
# Model building
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, GRU, Bidirectional
# Read the data and take a quick look
df = pd.read_csv("dev.csv", sep="\t")
df.head()
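# (Added sketch, not in the original script:) a quick sanity check of the dataset size
# and class balance; assumes the same "label" / "title" columns used below.
print(df.shape)
print(df["label"].value_counts())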
# Inspect the text lengths (character counts, since the features below are character-level)
# to pick a sensible maximum sequence length
seq_len = [len(i) for i in df["title"].to_list()]
pd.Series(seq_len).hist(bins=30)
max_len = 32
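# (Added, not in the original script:) a high percentile of the length distribution is
# another way to sanity-check the max_len = 32 choice made from the histogram.
print(int(np.percentile(seq_len, 95)))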
# Build the dataset
# Label set and raw text list
labels, vocabulary = list(df['label'].unique()), list(df['title'].unique())
# Build character-level features: concatenate all titles into one string
string = ''
for word in vocabulary:
    string += word
# Character vocabulary
vocabulary = set(string)
# Build the character and label dictionaries
word_dictionary = {word: i + 1 for i, word in enumerate(vocabulary)}
with open('word_dict.pk', 'wb') as f:
    pickle.dump(word_dictionary, f)
inverse_word_dictionary = {i + 1: word for i, word in enumerate(vocabulary)}
label_dictionary = {label: i for i, label in enumerate(labels)}
with open('label_dict.pk', 'wb') as f:
    pickle.dump(label_dictionary, f)
output_dictionary = {i: label for i, label in enumerate(labels)}
vocab_size = len(word_dictionary.keys())   # vocabulary size
label_size = len(label_dictionary.keys())  # number of label classes
# Pad the sequences to max_len; shorter sequences are padded with 0 at the end
x = [[word_dictionary[word] for word in sent] for sent in df['title']]
x = pad_sequences(maxlen=max_len, sequences=x, padding='post', value=0)
y = [[label_dictionary[sent]] for sent in df['label']]
y = [to_categorical(label, num_classes=label_size) for label in y]
y = np.array([list(_[0]) for _ in y])
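# (Added check, not in the original script:) x should be (num_samples, max_len) and
# y should be (num_samples, label_size) after padding and one-hot encoding.
print(x.shape, y.shape)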
# Train / test split
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.1, random_state=42)
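# (Added check, not in the original script:) confirm the split sizes; for skewed label
# distributions, passing stratify=y.argmax(axis=1) to train_test_split above would keep
# class proportions comparable between the two splits.
print(train_x.shape, test_x.shape)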
# Build the model: character embedding + two bidirectional GRU layers + softmax classifier
model = Sequential()
model.add(Embedding(vocab_size + 1, 300, input_length=max_len))
model.add(Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1, return_sequences=True)))
model.add(Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1)))
model.add(Dense(label_size, activation='softmax'))
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])
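# (Added, not in the original script:) print the layer shapes and parameter counts before training.
model.summary()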
# Train the model and save the final weights
history = model.fit(train_x, train_y, epochs=16, batch_size=32, verbose=1)
model.save("last_weights.h5")
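# (Added check, not in the original script:) evaluate on the held-out split after training.
test_loss, test_acc = model.evaluate(test_x, test_y, verbose=0)
print("test loss: %.4f, test accuracy: %.4f" % (test_loss, test_acc))
# If best-epoch rather than last-epoch weights are wanted, a tensorflow.keras.callbacks.ModelCheckpoint
# callback with save_best_only=True passed to model.fit above is one option; that is an
# assumption here, not part of the original script.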
# Prediction script
import pickle
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import warnings
warnings.filterwarnings("ignore")

max_len = 32  # must match the padding length used at training time
def main(word_dict, label_dict, model_save_path):
    # Load the dictionaries saved during training
    with open(word_dict, 'rb') as f:
        word_dictionary = pickle.load(f)
    with open(label_dict, 'rb') as f:
        label_dictionary = pickle.load(f)
    try:
        # Preprocess the input the same way as at training time
        sent = input("Enter the text for sentiment classification: ")
        x = [[word_dictionary[word] for word in sent]]
        x = pad_sequences(maxlen=max_len, sequences=x, padding='post', value=0)
        # Load the trained model
        model = load_model(model_save_path)
        # Predict and map the class index back to its label
        y_predict = model.predict(x)
        index_to_label = {v: k for k, v in label_dictionary.items()}
        print('Input text: %s\n' % sent)
        print('Predicted sentiment class: %s' % index_to_label[np.argmax(y_predict)])
    except KeyError as err:
        print("The input contains characters that are not in the vocabulary, please try again!")
        print("Out-of-vocabulary character: %s." % err)
word_dict = "word_dict.pk"
label_dict = "label_dict.pk"
model_save_path = "last_weights.h5"
main(word_dict, label_dict, model_save_path)