数据转换Word2Vec和Doc2Vec

  • Post author:
  • Post category:其他




Gensim 构建词袋模型

import jieba

# Stop words / punctuation tokens to strip after segmentation.
punctuation = [",","。",":", ";", "?"]

# Demo corpus: three short Chinese sentences.
content = ["机器学习带动人工智能飞速的发展。",
           "深度学习带动人工智能飞速的发展。",
           "机器学习和深度学习带动人工智能飞速的发展。"
          ]

# Segment every sentence with jieba.
segs_1 = [jieba.lcut(con) for con in content]

# Remove punctuation from each tokenized sentence.
tokenized = [[word for word in sentence if word not in punctuation]
             for sentence in segs_1]

# Vocabulary: union of all tokens (punctuation excluded), deduplicated.
bag_of_words = list({x for item in segs_1 for x in item if x not in punctuation})

# Manual one-hot bag-of-words: 1 if the vocabulary token occurs in the sentence.
bag_of_word2vec = []
for sentence in tokenized:
    tokens = [1 if token in sentence else 0 for token in bag_of_words]
    bag_of_word2vec.append(tokens)

from gensim import corpora
import gensim

# Build the gensim dictionary from the cleaned (punctuation-free) tokens.
dictionary = corpora.Dictionary(tokenized)
# Persist the dictionary for later reuse.
dictionary.save('deerwester.dict')
# Token -> integer-id mapping.
print(dictionary.token2id)
# FIX: convert the cleaned `tokenized` sentences — not the raw `segs_1`,
# which still contains punctuation.  doc2bow silently drops out-of-vocabulary
# tokens, which is why the original inconsistency went unnoticed.
corpus = [dictionary.doc2bow(sentence) for sentence in tokenized]
print(corpus)

运行结果如下:

{'人工智能': 0, '发展': 1, '学习': 2, '带动': 3, '机器': 4, '的': 5, '飞速': 6, '深度': 7, '和': 8}

稀疏向量结果如下:

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (5, 1), (6, 1), (7, 1)], [(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]]



词向量模型 Word2Vec

from gensim.models import Word2Vec
import jieba

# Stop words / punctuation tokens to strip after segmentation.
punctuation = [",", "。", ":", ";", ".", "'", '"', "’", "?", "/", "-", "+", "&", "(", ")"]
sentences = [
"长江是中国第一大河,干流全长6397公里(以沱沱河为源),一般称6300公里。流域总面积一百八十余万平方公里,年平均入海水量约九千六百余亿立方米。以干流长度和入海水量论,长江均居世界第三位。",
"黄河,中国古代也称河,发源于中华人民共和国青海省巴颜喀拉山脉,流经青海、四川、甘肃、宁夏、内蒙古、陕西、山西、河南、山东9个省区,最后于山东省东营垦利县注入渤海。干流河道全长5464千米,仅次于长江,为中国第二长河。黄河还是世界第五长河。",
"黄河,是中华民族的母亲河。作为中华文明的发祥地,维系炎黄子孙的血脉.是中华民族民族精神与民族情感的象征。",
"黄河被称为中华文明的母亲河。公元前2000多年华夏族在黄河领域的中原地区形成、繁衍。",
"在兰州的“黄河第一桥”内蒙古托克托县河口镇以上的黄河河段为黄河上游。",
"黄河上游根据河道特性的不同,又可分为河源段、峡谷段和冲积平原三部分。 ",
"黄河,是中华民族的母亲河。"
]

# Segment each sentence, then strip punctuation.
sentences = [jieba.lcut(sen) for sen in sentences]
tokenized = [[word for word in sentence if word not in punctuation]
             for sentence in sentences]

# Train a skip-gram (sg=1) Word2Vec model.
# FIX: gensim >= 4.0 renamed the `size` keyword to `vector_size`, and the
# similarity queries moved to the KeyedVectors stored on `model.wv`
# (`model.similarity` / `model.most_similar` were removed).
model = Word2Vec(tokenized, sg=1, vector_size=100, window=5, min_count=2,
                 negative=1, sample=0.001, hs=1, workers=4)
model.save('model')              # persist the trained model
model = Word2Vec.load('model')   # reload it (round-trip demo)
print(model.wv.similarity(u'黄河', u'长江'))
print(model.wv.most_similar(positive=[u'黄河', u'母亲河'], negative=[u'长江']))



将文档变成向量的工具:Doc2Vec

import jieba

# Stop words / punctuation tokens to strip after segmentation.
punctuation = [",", "。", ":", ";", ".", "'", '"', "’", "?", "/", "-", "+", "&", "(", ")"]
sentences = [
"长江是中国第一大河,干流全长6397公里(以沱沱河为源),一般称6300公里。流域总面积一百八十余万平方公里,年平均入海水量约九千六百余亿立方米。以干流长度和入海水量论,长江均居世界第三位。",
"黄河,中国古代也称河,发源于中华人民共和国青海省巴颜喀拉山脉,流经青海、四川、甘肃、宁夏、内蒙古、陕西、山西、河南、山东9个省区,最后于山东省东营垦利县注入渤海。干流河道全长5464千米,仅次于长江,为中国第二长河。黄河还是世界第五长河。",
"黄河,是中华民族的母亲河。作为中华文明的发祥地,维系炎黄子孙的血脉.是中华民族民族精神与民族情感的象征。",
"黄河被称为中华文明的母亲河。公元前2000多年华夏族在黄河领域的中原地区形成、繁衍。",
"在兰州的“黄河第一桥”内蒙古托克托县河口镇以上的黄河河段为黄河上游。",
"黄河上游根据河道特性的不同,又可分为河源段、峡谷段和冲积平原三部分。 ",
"黄河,是中华民族的母亲河。"
]

# Segment each sentence, then strip punctuation.
sentences = [jieba.lcut(sen) for sen in sentences]
tokenized = [[word for word in sentence if word not in punctuation]
             for sentence in sentences]

# Pair each tokenized document with its tag (label).
# FIX: `LabeledSentence` was removed in gensim >= 4.0; `TaggedDocument` is the
# long-standing replacement with the same (words, tags) interface.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
doc_labels = ["长江","黄河","黄河","黄河","黄河","黄河","黄河"]

class LabeledLineSentence(object):
    """Iterable yielding one TaggedDocument per (document, label) pair."""
    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield TaggedDocument(words=doc, tags=[self.labels_list[idx]])

iter_data = LabeledLineSentence(tokenized, doc_labels)
# PV-DM architecture (dm=1).
# FIX: gensim >= 4.0 renamed the `size` keyword to `vector_size`.
model = Doc2Vec(dm=1, vector_size=100, window=8, min_count=5, workers=4)
model.build_vocab(iter_data)
model.train(iter_data, total_examples=model.corpus_count, epochs=1000,
            start_alpha=0.01, end_alpha=0.001)
# Query the document-vector store.
# FIX: gensim >= 4.0 renamed `model.docvecs` to `model.dv`.
# Only two distinct tags exist (黄河 / 长江), so the nearest neighbour of
# 黄河 is 长江, together with its similarity score.
print(model.dv.most_similar('黄河'))
print(model.dv.similarity('黄河', '长江'))



版权声明:本文为lhxsir原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。