哈工大LTP基本使用-分词、词性标注、依存句法分析、命名实体识别、角色标注
上一节我们讲了LTP的基本使用,接下来我们使用其进行事件抽取。
参考代码:https://github.com/liuhuanyong/EventTriplesExtraction
import os

from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller


class LtpParser:
    """Wrapper around the pyltp models needed for triple extraction.

    Bundles word segmentation, POS tagging, dependency parsing, named entity
    recognition and semantic role labelling behind one object.
    """

    def __init__(self, ltp_dir="../model/ltp_data_v3.4.0/"):
        """Load every LTP model from *ltp_dir*.

        Args:
            ltp_dir: Directory containing ``cws.model``, ``pos.model``,
                ``parser.model``, ``ner.model``, ``pisrl.model`` and the
                user lexicon ``user_dict.txt``. Defaults to the path the
                original script used.
        """
        user_dict = os.path.join(ltp_dir, "user_dict.txt")

        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(os.path.join(ltp_dir, "cws.model"), user_dict)

        self.postagger = Postagger()
        self.postagger.load_with_lexicon(os.path.join(ltp_dir, "pos.model"), user_dict)

        self.parser = Parser()
        self.parser.load(os.path.join(ltp_dir, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(ltp_dir, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(ltp_dir, 'pisrl.model'))

    def format_labelrole(self, words, postags, arcs=None):
        """Run semantic role labelling and return it as a nested dict.

        Args:
            words: Segmented words of one sentence.
            postags: POS tags aligned with *words*.
            arcs: Optional dependency parse of the sentence. When omitted the
                sentence is parsed here, which is what the method always did
                before; passing it avoids parsing the same sentence twice.

        Returns:
            ``{predicate_index: {arg_name: [arg_name, start, end]}}`` where
            ``start``/``end`` are the argument's word range as reported by
            the labeller.
        """
        if arcs is None:
            arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        return {
            role.index: {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
            for role in roles
        }

    def build_parse_child_dict(self, words, postags, arcs):
        """Reshape a dependency parse into per-word child maps and flat rows.

        Args:
            words: Segmented words of one sentence.
            postags: POS tags aligned with *words*.
            arcs: Dependency arcs aligned with *words*; each arc exposes
                ``head`` (1-based index of the governing word, 0 for Root)
                and ``relation``.

        Returns:
            A tuple ``(child_dict_list, format_parse_list)``:

            * ``child_dict_list[i]`` maps a relation name to the 0-based
              indices of the words that depend on word ``i`` through that
              relation, in sentence order.
            * ``format_parse_list[i]`` is
              ``[relation, word, i, postag, head_word, head_index, head_postag]``
              with a 0-based ``head_index`` (-1 for Root).
        """
        rely_id = [arc.head for arc in arcs]        # 1-based head ids, 0 == Root
        relation = [arc.relation for arc in arcs]   # relation of each word to its head

        # One pass over the arcs instead of rescanning all arcs for every word.
        child_dict_list = [{} for _ in words]
        for arc_index, (head_id, rel) in enumerate(zip(rely_id, relation)):
            if 1 <= head_id <= len(words):
                child_dict_list[head_id - 1].setdefault(rel, []).append(arc_index)

        heads = ['Root' if head_id == 0 else words[head_id - 1] for head_id in rely_id]

        # NOTE: for the root word ``rely_id[i] - 1`` is -1, so the last column
        # wraps around to the POS tag of the final word. This is kept as-is so
        # the produced rows stay identical to the previously published output.
        format_parse_list = [
            [relation[i], words[i], i, postags[i], heads[i],
             rely_id[i] - 1, postags[rely_id[i] - 1]]
            for i in range(len(words))
        ]
        return child_dict_list, format_parse_list

    def parser_main(self, sentence):
        """Run the full pipeline on one sentence.

        Args:
            sentence: Raw sentence text.

        Returns:
            ``(words, postags, child_dict_list, roles_dict, format_parse_list)``.
        """
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        # Reuse the parse computed above rather than parsing a second time.
        roles_dict = self.format_labelrole(words, postags, arcs)
        return words, postags, child_dict_list, roles_dict, format_parse_list


if __name__ == '__main__':
    parse = LtpParser()
    sentence = '中国是一个自由、和平的国家'
    words, postags, child_dict_list, roles_dict, format_parse_list = parse.parser_main(sentence)
    print(words, len(words))
    print(postags, len(postags))
    print(child_dict_list, len(child_dict_list))
    print(roles_dict)
    print(format_parse_list, len(format_parse_list))
结果:
['中国', '是', '一个', '自由', '、', '和平', '的', '国家'] 8 ['ns', 'v', 'm', 'a', 'wp', 'a', 'u', 'n'] 8 [{}, {'SBV': [0], 'VOB': [7]}, {}, {'COO': [5], 'RAD': [6]}, {}, {'WP': [4]}, {}, {'ATT': [2, 3]}] 8 {1: {'A0': ['A0', 0, 0], 'A1': ['A1', 2, 7]}} [['SBV', '中国', 0, 'ns', '是', 1, 'v'], ['HED', '是', 1, 'v', 'Root', -1, 'n'], ['ATT', '一个', 2, 'm', '国家', 7, 'n'], ['ATT', '自由', 3, 'a', '国家', 7, 'n'], ['WP', '、', 4, 'wp', '和平', 5, 'a'], ['COO', '和平', 5, 'a', '自由', 3, 'a'], ['RAD', '的', 6, 'u', '自由', 3, 'a'], ['VOB', '国家', 7, 'n', '是', 1, 'v']] 8
分别说一下每个结果的含义:
分词结果:
['中国', '是', '一个', '自由', '、', '和平', '的', '国家']
词性标注结果:
['ns', 'v', 'm', 'a', 'wp', 'a', 'u', 'n']
依存句法分析结果:
[{}, {'SBV': [0], 'VOB': [7]}, {}, {'COO': [5], 'RAD': [6]}, {}, {'WP': [4]}, {}, {'ATT': [2, 3]}]
注意,该数组的长度是8,对应着分词之后的每一个词。该结果是在原来的句法依存分析结果上进一步处理得到的,最初依存句法分析的结果是:
2:SBV 0:HED 8:ATT 8:ATT 6:WP 4:COO 4:RAD 2:VOB
同时,句法分析中的索引是从1开始的(0表示Root根节点)。例如'中国'对应的是2:SBV,其中2是'中国'所依存的父节点的索引(即第2个词'是'),SBV是二者之间的关系名,也就是【中国-是】是主谓关系。我们把每个词作为父节点时所拥有的子节点,按关系名维护成一个单独的字典,字典中的索引从0开始。
角色标注结果:
{1: {'A0': ['A0', 0, 0], 'A1': ['A1', 2, 7]}}
整合结果:
[['SBV', '中国', 0, 'ns', '是', 1, 'v'], ['HED', '是', 1, 'v', 'Root', -1, 'n'], ['ATT', '一个', 2, 'm', '国家', 7, 'n'], ['ATT', '自由', 3, 'a', '国家', 7, 'n'], ['WP', '、', 4, 'wp', '和平', 5, 'a'], ['COO', '和平', 5, 'a', '自由', 3, 'a'], ['RAD', '的', 6, 'u', '自由', 3, 'a'], ['VOB', '国家', 7, 'n', '是', 1, 'v']]
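这个就是将一个词的相关信息都放到一个列表里面,每个列表的内容依次为:[依存关系, 词, 词的索引, 词性, 父节点词, 父节点索引, 父节点词性],其中索引从0开始。注意:核心词(HED)的父节点是Root,父节点索引为-1,代码中的postags[-1]取到的是最后一个词的词性,所以'是'这一项最后的'n'并不是Root真正的词性。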
from sentence_parser import *
import re
import os
from time import time
from pprint import pprint
from pyltp import SentenceSplitter, Segmentor, Postagger, Parser
from utils import clean_text
from collections import Counter


class TripleExtractor:
    """Extract [subject, predicate, object] triples from Chinese sentences.

    Semantic role labelling is tried first for every word; dependency-based
    rules are used for words that yield no role-based triple.
    """

    def __init__(self):
        self.parser = LtpParser()

    def split_sents(self, content):
        """Split *content* into sentences on sentence-level punctuation.

        Splits on question marks, exclamation marks, the Chinese full stop,
        semicolons, colons (full-width and ASCII forms) and line breaks.
        Only non-empty pieces shorter than 300 characters that contain the
        hard-coded keyword '北京银行' are returned.
        """
        return [sentence for sentence in re.split(r'[？?！!。；;：:\n\r]', content)
                if sentence and '北京银行' in sentence and len(sentence) < 300]

    @staticmethod
    def _span_text(words, postags, start, end):
        """Join words[start..end] (inclusive), skipping tags starting with w/u/x."""
        return ''.join(
            words[i] for i in range(start, end + 1)
            if postags[i][0] not in ['w', 'u', 'x'] and words[i]
        )

    def ruler1(self, words, postags, roles_dict, role_index):
        """Build a triple directly from the A0/A1 roles of one predicate.

        Example inputs:
            words:      ['中国', '是', '一个', '自由', '、', '和平', '的', '国家']
            postags:    ['ns', 'v', 'm', 'a', 'wp', 'a', 'u', 'n']
            roles_dict: {1: {'A0': ['A0', 0, 0], 'A1': ['A1', 2, 7]}}
            role_index: 1

        Args:
            words: Segmented words.
            postags: POS tags aligned with *words*.
            roles_dict: ``{predicate_index: {role: [role, start, end]}}``.
            role_index: Index of the predicate; must be a key of *roles_dict*.

        Returns:
            ``('1', [subject, verb, object])`` when both A0 and A1 exist and
            are non-empty after filtering, otherwise ``('4', [])``.
        """
        v = words[role_index]
        role_info = roles_dict[role_index]
        if 'A0' in role_info and 'A1' in role_info:
            s = self._span_text(words, postags, role_info['A0'][1], role_info['A0'][2])
            o = self._span_text(words, postags, role_info['A1'][1], role_info['A1'][2])
            if s and o:
                return '1', [s, v, o]
        return '4', []

    def ruler2(self, words, postags, child_dict_list, roles_dict, arcs):
        """Main triple-extraction routine for one sentence.

        Example inputs:
            words:           ['中国', '是', '一个', '自由', '、', '和平', '的', '国家']
            postags:         ['ns', 'v', 'm', 'a', 'wp', 'a', 'u', 'n']
            child_dict_list: [{}, {'SBV': [0], 'VOB': [7]}, {}, {'COO': [5], 'RAD': [6]},
                              {}, {'WP': [4]}, {}, {'ATT': [2, 3]}]
            roles_dict:      {1: {'A0': ['A0', 0, 0], 'A1': ['A1', 2, 7]}}
            arcs:            [['SBV', '中国', 0, 'ns', '是', 1, 'v'],
                              ['HED', '是', 1, 'v', 'Root', -1, 'n'], ...]

        Args:
            words: Segmented words.
            postags: POS tags aligned with *words*.
            child_dict_list: Per-word ``{relation: [child indices]}`` maps.
            roles_dict: Semantic roles keyed by predicate index.
            arcs: Per-word rows laid out as
                ``[relation, word, word_index, postag, head_word, head_index, head_postag]``
                where ``head_index`` is 0-based.

        Returns:
            A list of ``[e1, r, e2]`` triples.
        """
        svos = []
        for index in range(len(postags)):
            # First try semantic roles for this word.
            if index in roles_dict:
                flag, triple = self.ruler1(words, postags, roles_dict, index)
                if flag == '1':
                    svos.append(triple)
                    continue

            # No role-based triple: fall back to dependency rules. The check is
            # deliberately "any non-empty tag" rather than restricting to 'v'.
            if not postags[index]:
                continue
            child_dict = child_dict_list[index]

            # Subject-verb-object.
            # SBV: 我送她一束花 (我 <– 送); VOB: 我送她一束花 (送 –> 花)
            if 'SBV' in child_dict and 'VOB' in child_dict:
                r = words[index]
                e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                svos.append([e1, r, e2])

            # Attributive word that itself governs an object.
            # ATT: 红苹果 (红 <– 苹果)
            relation = arcs[index][0]
            # Position 5 holds the 0-based index of the governing word;
            # position 2 is the word's own index and must not be used here.
            head = arcs[index][5]
            if relation == 'ATT' and 'VOB' in child_dict:
                e1 = self.complete_e(words, postags, child_dict_list, head)
                r = words[index]
                e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                temp_string = r + e2
                # The expanded head may begin with this very modifier phrase;
                # strip it so e1 is only the modified entity.
                if temp_string == e1[:len(temp_string)]:
                    e1 = e1[len(temp_string):]
                if temp_string not in e1:
                    svos.append([e1, r, e2])

            # Subject + verb + complement whose complement governs a
            # prepositional object.
            # CMP: 做完了作业 (做 –> 完); POB: 在贸易区内 (在 –> 内)
            if 'SBV' in child_dict and 'CMP' in child_dict:
                e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                cmp_index = child_dict['CMP'][0]
                r = words[index] + words[cmp_index]
                if 'POB' in child_dict_list[cmp_index]:
                    e2 = self.complete_e(words, postags, child_dict_list,
                                         child_dict_list[cmp_index]['POB'][0])
                    svos.append([e1, r, e2])
        return svos

    def complete_e(self, words, postags, child_dict_list, word_index):
        """Expand a subject/object word into a fuller phrase.

        All ATT children are expanded recursively and prepended. When the
        word is tagged 'v', its first VOB child is appended and its first SBV
        child is placed in front of the ATT prefix.

        Args:
            words: Segmented words.
            postags: POS tags aligned with *words*.
            child_dict_list: Per-word ``{relation: [child indices]}`` maps.
            word_index: Index of the word to expand.

        Returns:
            The expanded phrase as a single string.
        """
        child_dict = child_dict_list[word_index]

        prefix = ''.join(
            self.complete_e(words, postags, child_dict_list, att_index)
            for att_index in child_dict.get('ATT', [])
        )

        postfix = ''
        if postags[word_index] == 'v':
            if 'VOB' in child_dict:
                postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
            if 'SBV' in child_dict:
                prefix = self.complete_e(words, postags, child_dict_list,
                                         child_dict['SBV'][0]) + prefix

        return prefix + words[word_index] + postfix

    def triples_main(self, content):
        """Extract triples from *content*, treated as a single sentence.

        The text is not split into sentences first; it is passed to the
        parser as-is.

        Returns:
            A list of ``[e1, r, e2]`` triples.
        """
        words, postags, child_dict_list, roles_dict, arcs = self.parser.parser_main(content)
        return self.ruler2(words, postags, child_dict_list, roles_dict, arcs)


def test():
    """Print the triples extracted from a few sample sentences."""
    extractor = TripleExtractor()
    contents = [
        '中国是一个自由、和平的国家',
        '他什么书都读',
        '在贸易区内,他完成了交易',
        '红色的苹果真好看',
        '我送她一朵花',
        '我做完了作业',
    ]
    for content in contents:
        print(extractor.triples_main(content))


test()
具体看注释。
结果:
[['中国', '是', '一个自由和平国家']] [] [['他', '完成', '交易']] [] [['我', '送', '一朵花']] [['我', '做', '作业']]