In the previous post, 用ltp提取文本关系并创建知识图谱(基于neo4j)(一), I used LTP to analyze a single sentence, extracted its semantic dependency relations, and built a graph in Neo4j with Python. This post extends that one. The code is broadly the same, but here we build a knowledge graph from several sentences, and instead of semantic dependency relations we use syntactic dependency relations (subject-verb, verb-object, and so on).
Extracting text relations with LTP:
This is only a simple demo. The text to analyze is:
他叫汤姆去拿外衣。汤姆生病了。他去了医院。
Feel free to replace it with your own text.
from ltp import LTP

def ltp_data():
    """Run LTP on the text and return the dependency parse, POS tags and segmentation."""
    ltp = LTP()
    # Sentence splitting
    sents = ltp.sent_split(["他叫汤姆去拿外衣.汤姆生病了。他去了医院。"])
    # Word segmentation
    seg, hidden = ltp.seg(sents)
    # Part-of-speech tagging
    pos = ltp.pos(hidden)
    # Named entity recognition (not used further below)
    ner = ltp.ner(hidden)
    # Semantic role labeling (not used further below)
    srl = ltp.srl(hidden)
    # Dependency parsing (this is what the graph is built from)
    dep = ltp.dep(hidden)
    # Semantic dependency parsing, graph mode (not used further below)
    sdp = ltp.sdp(hidden, mode='graph')
    return dep, pos, seg
Let's take a look at what it returns:
if __name__ == '__main__':
    ds, pos, seg = ltp_data()
    print("Dependency relations: {k}".format(k=ds))
    print("POS tags: {k}".format(k=pos))
    print("Segmentation: {k}".format(k=seg))
out:
Dependency relations: [[(1, 2, 'SBV'), (2, 0, 'HED'), (3, 2, 'DBL'), (4, 5, 'ADV'), (5, 2, 'VOB'), (6, 5, 'VOB'), (7, 2, 'WP')], [(1, 2, 'SBV'), (2, 0, 'HED'), (3, 2, 'RAD'), (4, 2, 'WP')], [(1, 2, 'SBV'), (2, 0, 'HED'), (3, 2, 'RAD'), (4, 2, 'VOB'), (5, 2, 'WP')]]
POS tags: [['r', 'v', 'nh', 'v', 'v', 'n', 'wp'], ['nh', 'v', 'u', 'wp'], ['r', 'v', 'u', 'n', 'wp']]
Segmentation: [['他', '叫', '汤姆', '去', '拿', '外衣', '.'], ['汤姆', '生病', '了', '。'], ['他', '去', '了', '医院', '。']]
For the exact meaning of the tags and relation labels, see the LTP appendix.
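Each tuple in the dependency output has the form (word index, head index, relation), where indices are 1-based and a head index of 0 stands for the virtual root. As a quick sanity check, here is a small sketch (reusing the ds and seg values returned above) that prints the first sentence's triples in a readable form:
# Sketch: print the first sentence's dependency triples as word pairs.
# Indices are 1-based; a head index of 0 means the virtual root (the HED relation).
for child, head, rel in ds[0]:
    child_word = seg[0][child - 1]
    head_word = "ROOT" if head == 0 else seg[0][head - 1]
    print("{} --{}--> {}".format(child_word, rel, head_word))
For the demo text this prints lines such as 他 --SBV--> 叫 and 叫 --HED--> ROOT.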
Extracting nodes and relations:
Clean up the results returned in the previous step and extract the nodes and relations from them.
Extracting nodes:
def node_extraction(seg, pos):
    """Extract node names (words) and node types (POS tags) from the parse results."""
    for i in range(len(seg)):
        seg[i] = [str(word) for word in seg[i]]
        pos[i] = [str(tag) for tag in pos[i]]
    return seg, pos
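For reference, on the demo text the two lists returned for the first sentence simply mirror the segmentation and POS output shown earlier:
# Expected shape for the first sentence (matches the seg/pos output printed above).
node_name, node_type = node_extraction(seg, pos)
print(node_name[0])  # ['他', '叫', '汤姆', '去', '拿', '外衣', '.']
print(node_type[0])  # ['r', 'v', 'nh', 'v', 'v', 'n', 'wp']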
Relation extraction needs the Node objects that are created later on, so it takes a nodes parameter, which is produced by the node-creation function further below.
Extracting relations:
def relation_extraction(ds, nodes):
    """
    Extract the relations between nodes, combine each pair of nodes and their
    relation into a triple (node1, node2, relation), and collect the triples in a list.
    """
    rel = []
    for ds_sentence, nodes_sentence in zip(ds, nodes):
        rel_sentence = []
        for ds_word in ds_sentence:
            # Look up the two nodes by the 1-based indices in the dependency tuple.
            # A head index of 0 marks the root (HED); with the -1 offset it wraps to
            # the last node of the sentence, which is why HED links to the
            # sentence-final punctuation in the sample output.
            index1 = int(ds_word[0]) - 1
            index2 = int(ds_word[1]) - 1
            node1 = nodes_sentence[index1]
            node2 = nodes_sentence[index2]
            relation = ds_word[2]
            # Pack the two nodes and the relation into a triple
            rel_word = [node1, node2, relation]
            # Collect the triples of this sentence
            rel_sentence.append(rel_word)
        # Collect each sentence's triples into the overall list
        rel.append(rel_sentence)
    return rel
Creating nodes and relations:
This step builds the knowledge graph, so you first need to connect to Neo4j. In the connection call, the first argument is the URL that Neo4j prints when you start it from the command line (http://localhost:7474), the second is the username, and the third is the password.
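If you want to confirm that the connection works before running the full script, here is a minimal, hedged check. It assumes a local server reachable at the default HTTP address with the default neo4j username; the password is a placeholder, and the keyword arguments follow the same py2neo v4 style as the class below.
from py2neo import Graph, Node

# Minimal connection check (assumptions: local Neo4j at http://localhost:7474,
# default username "neo4j"; replace the password with your own).
graph = Graph("http://localhost:7474", username="neo4j", password="your password")
graph.create(Node("Test", name="ping"))
print(graph.run("MATCH (n:Test) RETURN n.name LIMIT 1").evaluate())  # should print: ping
Once the connection is confirmed, the class below wraps the node and relation creation.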
from py2neo import Node, Graph, Relationship
from ltp_data import ltp_data

# It may help to read the py2neo docs first: https://py2neo.org/v4/index.htm

class DataToNeo4j(object):
    """Write the extracted nodes and relations into Neo4j."""

    def __init__(self):
        """Connect to Neo4j and start from an empty database."""
        # Fill in the URL (e.g. http://localhost:7474), username and password.
        link = Graph("your localhost", username="your username", password="your password")
        self.graph = link
        # self.graph = NodeMatcher(link)
        self.graph.delete_all()
        """
        A small standalone py2neo example for reference:
        node3 = Node('animal' , name = 'cat')
        node4 = Node('animal' , name = 'dog')
        node2 = Node('Person' , name = 'Alice')
        node1 = Node('Person' , name = 'Bob')
        r1 = Relationship(node2 , 'know' , node1)
        r2 = Relationship(node1 , 'know' , node3)
        r3 = Relationship(node2 , 'has' , node3)
        r4 = Relationship(node4 , 'has' , node2)
        self.graph.create(node1)
        self.graph.create(node2)
        self.graph.create(node3)
        self.graph.create(node4)
        self.graph.create(r1)
        self.graph.create(r2)
        self.graph.create(r3)
        self.graph.create(r4)
        """
    def create_node(self, name_node, type_node):
        """Create a Node for every word, using its POS tag as the label."""
        nodes = []
        for name_sentence, type_sentence in zip(name_node, type_node):
            nodes_sentence = []
            for name_word, type_word in zip(name_sentence, type_sentence):
                # Create the node in the database
                node = Node(type_word, name=name_word)
                self.graph.create(node)
                # Keep the Node object so relations can refer to it later
                nodes_sentence.append(node)
            nodes.append(nodes_sentence)
        print('Nodes created successfully')
        return nodes
    def create_relation(self, rel):
        """Create the relationships between the nodes."""
        for sentence in rel:
            for word in sentence:
                try:
                    # The relation type has to be a string
                    r = Relationship(word[0], str(word[2]), word[1])
                    self.graph.create(r)
                except AttributeError as e:
                    print(e)
        print('Relationships created successfully')
Test run:
if __name__ == '__main__':
    ds, pos, seg = ltp_data()
    create_data = DataToNeo4j()
    # Create the nodes
    node_name, node_type = node_extraction(seg, pos)
    nodes = create_data.create_node(node_name, node_type)
    print("Nodes of the first sentence:\n{k}".format(k=nodes[0]))
    # Create the relationships
    rel = relation_extraction(ds, nodes)
    create_data.create_relation(rel)
    print("Relations of the first sentence:\n{k}".format(k=rel[0]))
out:
Nodes created successfully
Nodes of the first sentence:
[Node('r', name='他'), Node('v', name='叫'), Node('nh', name='汤姆'), Node('v', name='去'), Node('v', name='拿'), Node('n', name='外衣'), Node('wp', name='.')]
Relationships created successfully
Relations of the first sentence:
[[Node('r', name='他'), Node('v', name='叫'), 'SBV'], [Node('v', name='叫'), Node('wp', name='.'), 'HED'], [Node('nh', name='汤姆'), Node('v', name='叫'), 'DBL'], [Node('v', name='去'), Node('v', name='拿'), 'ADV'], [Node('v', name='拿'), Node('v', name='叫'), 'VOB'], [Node('n', name='外衣'), Node('v', name='拿'), 'VOB'], [Node('wp', name='.'), Node('v', name='叫'), 'WP']]
The result:
Switching to a different passage:
农场的一群动物成功地进行了一场革命,将压榨他们的人类东家赶出农场,建立起一个平等的动物社会。然而,动物领袖,那些聪明的猪们最终却篡夺了革命的果实,成为比人类东家更加独裁和极权的统治者。
Note the red arrow in the screenshot: the Neo4j browser only displays a limited number of nodes by default, so when you create many nodes some of them may not show up, which can make it look as if they were never created. Simply raise that display limit.
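If some nodes seem to be missing, you can also check the actual counts from Python instead of relying on the browser view. A hedged sketch, assuming create_data is the DataToNeo4j instance from the test script above and using py2neo's run method with plain Cypher:
# Count what was actually stored, independent of the browser's display limit.
total_nodes = create_data.graph.run("MATCH (n) RETURN count(n)").evaluate()
total_rels = create_data.graph.run("MATCH ()-[r]->() RETURN count(r)").evaluate()
print(total_nodes, total_rels)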
Full code:
ltp_data.py
from ltp import LTP

def ltp_data():
    """Run LTP on the text and return the dependency parse, POS tags and segmentation."""
    ltp = LTP()
    # Sentence splitting
    sents = ltp.sent_split(["他叫汤姆去拿外衣.汤姆生病了。他去了医院。"])
    # Word segmentation
    seg, hidden = ltp.seg(sents)
    # Part-of-speech tagging
    pos = ltp.pos(hidden)
    # Named entity recognition (not used further below)
    ner = ltp.ner(hidden)
    # Semantic role labeling (not used further below)
    srl = ltp.srl(hidden)
    # Dependency parsing (this is what the graph is built from)
    dep = ltp.dep(hidden)
    # Semantic dependency parsing, graph mode (not used further below)
    sdp = ltp.sdp(hidden, mode='graph')
    return dep, pos, seg

if __name__ == '__main__':
    ds, pos, seg = ltp_data()
    print("Dependency relations: {k}".format(k=ds))
    print("POS tags: {k}".format(k=pos))
    print("Segmentation: {k}".format(k=seg))
neo4j.py
# -*- coding: utf-8 -*-
from py2neo import Node, Graph, Relationship
from ltp_data import ltp_data

# It may help to read the py2neo docs first: https://py2neo.org/v4/index.htm

class DataToNeo4j(object):
    """Write the extracted nodes and relations into Neo4j."""

    def __init__(self):
        """Connect to Neo4j and start from an empty database."""
        # Fill in the URL (e.g. http://localhost:7474), username and password.
        link = Graph("your localhost", username="your username", password="your password")
        self.graph = link
        # self.graph = NodeMatcher(link)
        self.graph.delete_all()
        """
        A small standalone py2neo example for reference:
        node3 = Node('animal' , name = 'cat')
        node4 = Node('animal' , name = 'dog')
        node2 = Node('Person' , name = 'Alice')
        node1 = Node('Person' , name = 'Bob')
        r1 = Relationship(node2 , 'know' , node1)
        r2 = Relationship(node1 , 'know' , node3)
        r3 = Relationship(node2 , 'has' , node3)
        r4 = Relationship(node4 , 'has' , node2)
        self.graph.create(node1)
        self.graph.create(node2)
        self.graph.create(node3)
        self.graph.create(node4)
        self.graph.create(r1)
        self.graph.create(r2)
        self.graph.create(r3)
        self.graph.create(r4)
        """
    def create_node(self, name_node, type_node):
        """Create a Node for every word, using its POS tag as the label."""
        nodes = []
        for name_sentence, type_sentence in zip(name_node, type_node):
            nodes_sentence = []
            for name_word, type_word in zip(name_sentence, type_sentence):
                # Create the node in the database
                node = Node(type_word, name=name_word)
                self.graph.create(node)
                # Keep the Node object so relations can refer to it later
                nodes_sentence.append(node)
            nodes.append(nodes_sentence)
        print('Nodes created successfully')
        return nodes
    def create_relation(self, rel):
        """Create the relationships between the nodes."""
        for sentence in rel:
            for word in sentence:
                try:
                    # The relation type has to be a string
                    r = Relationship(word[0], str(word[2]), word[1])
                    self.graph.create(r)
                except AttributeError as e:
                    print(e)
        print('Relationships created successfully')
def node_extraction(seg, pos):
    """Extract node names (words) and node types (POS tags) from the parse results."""
    for i in range(len(seg)):
        seg[i] = [str(word) for word in seg[i]]
        pos[i] = [str(tag) for tag in pos[i]]
    return seg, pos
def relation_extraction(ds, nodes):
    """
    Extract the relations between nodes, combine each pair of nodes and their
    relation into a triple (node1, node2, relation), and collect the triples in a list.
    """
    rel = []
    for ds_sentence, nodes_sentence in zip(ds, nodes):
        rel_sentence = []
        for ds_word in ds_sentence:
            # Look up the two nodes by the 1-based indices in the dependency tuple.
            # A head index of 0 marks the root (HED); with the -1 offset it wraps to
            # the last node of the sentence, which is why HED links to the
            # sentence-final punctuation in the sample output.
            index1 = int(ds_word[0]) - 1
            index2 = int(ds_word[1]) - 1
            node1 = nodes_sentence[index1]
            node2 = nodes_sentence[index2]
            relation = ds_word[2]
            # Pack the two nodes and the relation into a triple
            rel_word = [node1, node2, relation]
            # Collect the triples of this sentence
            rel_sentence.append(rel_word)
        # Collect each sentence's triples into the overall list
        rel.append(rel_sentence)
    return rel
if __name__ == '__main__':
    ds, pos, seg = ltp_data()
    create_data = DataToNeo4j()
    # Create the nodes
    node_name, node_type = node_extraction(seg, pos)
    nodes = create_data.create_node(node_name, node_type)
    print("Nodes of the first sentence:\n{k}".format(k=nodes[0]))
    # Create the relationships
    rel = relation_extraction(ds, nodes)
    create_data.create_relation(rel)
    print("Relations of the first sentence:\n{k}".format(k=rel[0]))