python

  • Post author:
  • Post category:python


# Data handling and persistence
import pickle
import re

import numpy as np
import pandas as pd

# Data preprocessing
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Model building
from tensorflow.keras.layers import Embedding, Flatten, Dense, GRU, Bidirectional
from tensorflow.keras.models import Sequential

# Load the tab-separated dataset and peek at the first rows.
df = pd.read_csv("dev.csv", sep="\t")

df.head()

# Inspect the distribution of title lengths to pick a sensible maximum
# sequence length. Titles are encoded one character at a time below, so the
# length is measured in characters rather than whitespace-separated tokens.
seq_len = [len(title) for title in df["title"].to_list()]

pd.Series(seq_len).hist(bins=30)

max_len = 32

# ---- Dataset construction ----

# Distinct labels and distinct titles.
labels, vocabulary = list(df["label"].unique()), list(df["title"].unique())

# Build character-level features: concatenate every distinct title ...
string = "".join(vocabulary)

# ... and keep each distinct character once. Sorting makes the character ->
# index mapping reproducible between runs (plain set order is not).
vocabulary = sorted(set(string))

# Character <-> index tables. Index 0 is reserved for padding, so real
# characters start at 1.
word_dictionary = {word: i + 1 for i, word in enumerate(vocabulary)}

with open("word_dict.pk", "wb") as f:
    pickle.dump(word_dictionary, f)

inverse_word_dictionary = {i + 1: word for i, word in enumerate(vocabulary)}

# Label <-> index tables.
label_dictionary = {label: i for i, label in enumerate(labels)}

with open("label_dict.pk", "wb") as f:
    pickle.dump(label_dictionary, f)

output_dictionary = {i: label for i, label in enumerate(labels)}

vocab_size = len(word_dictionary)  # number of distinct characters
label_size = len(label_dictionary)  # number of label classes

# Encode every title as a list of character indices, then pad with 0 at the
# end (or truncate) so that every sequence has exactly max_len entries.
x = [[word_dictionary[word] for word in sent] for sent in df["title"]]

x = pad_sequences(maxlen=max_len, sequences=x, padding="post", value=0)

# One-hot encode the labels into an array with one row per sample and
# label_size columns.
y = to_categorical(
    [label_dictionary[label] for label in df["label"]],
    num_classes=label_size,
)

# Hold out 10% of the samples as a test set.
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.1, random_state=42
)

# ---- Model definition ----

model = Sequential()

# vocab_size + 1 rows because index 0 is the padding value.
model.add(Embedding(vocab_size + 1, 300, input_length=max_len))

model.add(
    Bidirectional(
        GRU(256, dropout=0.2, recurrent_dropout=0.1, return_sequences=True)
    )
)

model.add(Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1)))

model.add(Dense(label_size, activation="softmax"))

# The loss value is always reported during training; "loss" is not a valid
# metric identifier, so only accuracy is requested here.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# ---- Training ----

history = model.fit(train_x, train_y, epochs=16, batch_size=32, verbose=1)

model.save("last_weights.h5")

# 模型预测代码

import pickle

import numpy as np

from tensorflow.keras.models import load_model

from tensorflow.keras.preprocessing.sequence import pad_sequences

import warnings

# Silence all warnings for the interactive prediction session.
warnings.filterwarnings("ignore")

def main(word_dict, label_dict, model_save_path, max_len=32):
    """Classify one sentence typed by the user and print the predicted label.

    Args:
        word_dict: Path of the pickled character -> index dictionary.
        label_dict: Path of the pickled label -> index dictionary.
        model_save_path: Path of the saved Keras model.
        max_len: Sequence length the input is padded/truncated to. It must
            match the length used when the model was trained; the default
            mirrors the value used by the training code in this file.

    Returns:
        None. The result (or an out-of-vocabulary message) is printed.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # dictionary files that were produced by the training step.
    with open(word_dict, 'rb') as f:
        word_dictionary = pickle.load(f)

    with open(label_dict, 'rb') as f:
        output_dictionary = pickle.load(f)

    sent = input("请输入需要进行情绪分类的文本:")

    # Only the vocabulary lookup is guarded, so that a KeyError raised
    # anywhere else is not misreported as an unknown character.
    try:
        encoded = [[word_dictionary[word] for word in sent]]
    except KeyError as err:
        print("您输入的句子有汉字不在词汇表中,请重新输入!")
        print("不在词汇表中的单词为:%s." % err)
        return

    # Pad with 0 at the end (or truncate) to the length the model expects.
    x = pad_sequences(maxlen=max_len, sequences=encoded, padding='post', value=0)

    lstm_model = load_model(model_save_path)

    y_predict = lstm_model.predict(x)

    # The pickled dictionary maps label -> index; invert it so the index of
    # the highest score can be turned back into a label.
    index_to_label = {v: k for k, v in output_dictionary.items()}

    print('输入语句: %s\n' % sent)
    print('情感预测的分类结果为: %s' % index_to_label[int(np.argmax(y_predict))])


# Artifacts written by the training code earlier in this file.
word_dict = "word_dict.pk"

label_dict = "label_dict.pk"

# The training code saves the model as "last_weights.h5"; nothing in this
# file writes "best_weights.h5", so load the file that actually exists.
model_save_path = "last_weights.h5"

main(word_dict, label_dict, model_save_path)



版权声明:本文为qq_33807175原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。