基于CRF的实体识别
2019-02-26 17:05
204 查看
实体识别在实际生活中具有很大的应用价值,如识别一段文字中的人名,为构建知识图谱提供重要的基础支撑。常见的实体识别主要包括人名、地名、时间和组织机构;也可以根据业务的需求构建相应的实体。本文以CRF模型为理论支撑,利用人民日报的语料进行人名、地名、时间以及组织机构识别,从而从一段冗长的信息中提取出所需要的实体信息。
CRF的理论可以参考相关资料进行阅读,本文主要作为学习笔记,并为后续其他业务方向的需求做一个技术上的基础实践。
# -*- coding: utf-8 -*-
"""CRF-based named entity recognition (PER / LOC / ORG / T).

Trains sklearn-crfsuite on the People's Daily (199801) tagged corpus.
Pipeline: pre-process the raw corpus -> build character / POS / BIO-tag
sequences -> extract 3-character window features -> train a CRF ->
extract entities from raw text with ``predict``.
"""
import re

import sklearn_crfsuite  # pip install sklearn-crfsuite
from sklearn_crfsuite import metrics

try:
    # joblib is a standalone package in modern environments.
    import joblib
except ImportError:
    # FIX: ``sklearn.externals.joblib`` was removed in scikit-learn >= 0.23;
    # keep it only as a fallback for very old installs.
    from sklearn.externals import joblib

# --- configuration -----------------------------------------------------------
# FIX: raw strings — the originals relied on "\w", "\N" etc. not being escapes.
train_corpus_path = r"D:\workspace\project\NLPcase\ner\data\199801.txt"
process_corpus_path = r"D:\workspace\project\NLPcase\ner\data\result-rmrb.txt"

# POS tag -> entity label. In the PKU / People's Daily tagset:
#   t = time word, nr = person name, ns = place name, nt = organisation.
# FIX: the original mapped ns->ORG and nt->LOC, i.e. the place-name and
# organisation labels were swapped.
_maps = {u't': u'T', u'nr': u'PER', u'ns': u'LOC', u'nt': u'ORG'}


def read_corpus_from_file(file_path):
    """Return all lines of ``file_path``.

    FIX: the original ignored ``file_path`` and always read
    ``train_corpus_path``, so ``initialize()`` silently re-read the raw
    corpus instead of the pre-processed one.

    NOTE(review): no explicit encoding, matching the original — the 199801
    corpus is often GBK-encoded; confirm against the actual data file.
    """
    with open(file_path, 'r') as f:
        return f.readlines()


def write_corpus_to_file(data, file_path):
    """Write ``data`` (a bytes object) to ``file_path``."""
    with open(file_path, 'wb') as f:
        f.write(data)


def q_to_b(q_str):
    """Convert full-width characters in ``q_str`` to half-width."""
    chars = []
    for uchar in q_str:
        code = ord(uchar)
        if code == 12288:                 # full-width space -> ASCII space
            code = 32
        elif 65281 <= code <= 65374:      # other full-width chars: fixed offset
            code -= 65248
        chars.append(chr(code))
    return ''.join(chars)


def b_to_q(b_str):
    """Convert half-width characters in ``b_str`` to full-width."""
    chars = []
    for uchar in b_str:
        code = ord(uchar)
        if code == 32:                    # ASCII space -> full-width space
            code = 12288
        elif 32 <= code <= 126:           # printable ASCII: fixed offset
            code += 65248
        chars.append(chr(code))
    return ''.join(chars)


def pre_process():
    """Pre-process the raw corpus and write the result file.

    Merges split time words, split person names and bracketed compounds,
    drops each line's leading id token, and stops after 100 lines
    (presumably to keep the demo fast — kept from the original).
    """
    lines = read_corpus_from_file(train_corpus_path)
    new_lines = []
    for line_no, line in enumerate(lines, start=1):
        words = q_to_b(line.strip()).split(u' ')
        pro_words = process_t(words)
        pro_words = process_nr(pro_words)
        pro_words = process_k(pro_words)
        new_lines.append(' '.join(pro_words[1:]))   # [1:] drops the date/id token
        if line_no == 100:
            break
    write_corpus_to_file(data='\n'.join(new_lines).encode('utf-8'),
                         file_path=process_corpus_path)


def process_k(words):
    """Merge coarse-grained bracketed segments.

    e.g. ``[国家/n 环保局/n]nt`` becomes the single token ``国家环保局/nt``.
    """
    pro_words = []
    index = 0
    temp = u''
    while True:
        word = words[index] if index < len(words) else u''
        if u'[' in word:
            # opening bracket: start accumulating, stripping POS tags
            temp += re.sub(pattern=u'/[a-zA-Z]*', repl=u'',
                           string=word.replace(u'[', u''))
        elif u']' in word:
            # closing bracket: finish the compound with the outer tag
            w = word.split(u']')
            temp += re.sub(pattern=u'/[a-zA-Z]*', repl=u'', string=w[0])
            pro_words.append(temp + u'/' + w[1])
            temp = u''
        elif temp:
            # inside a bracketed span
            temp += re.sub(pattern=u'/[a-zA-Z]*', repl=u'', string=word)
        elif word:
            pro_words.append(word)
        else:
            break
        index += 1
    return pro_words


def process_nr(words):
    """Merge surname/given-name pairs tagged separately.

    e.g. ``温/nr 家宝/nr`` becomes ``温家宝/nr``.
    """
    pro_words = []
    index = 0
    while True:
        word = words[index] if index < len(words) else u''
        if u'/nr' in word:
            next_index = index + 1
            if next_index < len(words) and u'/nr' in words[next_index]:
                pro_words.append(word.replace(u'/nr', u'') + words[next_index])
                index = next_index          # skip the merged second token
            else:
                pro_words.append(word)
        elif word:
            pro_words.append(word)
        else:
            break
        index += 1
    return pro_words


def process_t(words):
    """Merge consecutive time tokens.

    e.g. ``一九九七年/t 十二月/t 三十一日/t`` becomes one ``.../t`` token.
    """
    pro_words = []
    index = 0
    temp = u''
    while True:
        word = words[index] if index < len(words) else u''
        if u'/t' in word:
            # keep only the final /t suffix while accumulating
            temp = temp.replace(u'/t', u'') + word
        elif temp:
            pro_words.append(temp)
            pro_words.append(word)
            temp = u''
        elif word:
            pro_words.append(word)
        else:
            break
        index += 1
    return pro_words


def pos_to_tag(p):
    """Map a POS tag to an entity label; unknown POS -> 'O'."""
    t = _maps.get(p, None)
    return t if t else u'O'


def tag_perform(tag, index):
    """Apply the BIO scheme: first character of an entity gets B_, rest I_."""
    if index == 0 and tag != u'O':
        return u'B_{}'.format(tag)
    elif tag != u'O':
        return u'I_{}'.format(tag)
    else:
        return tag


def pos_perform(pos):
    """Strip label prior knowledge from entity POS tags (except time)."""
    if pos in _maps.keys() and pos != u't':
        return u'n'
    else:
        return pos


def initialize():
    """Load the pre-processed corpus and build training sequences."""
    lines = read_corpus_from_file(process_corpus_path)
    words_list = [line.strip().split(' ') for line in lines if line.strip()]
    del lines  # release the raw text before building the sequences
    return init_sequence(words_list)


def init_sequence(words_list):
    """Build per-sentence character, POS and BIO-tag sequences.

    Per sentence: characters padded with <BOS>/<EOS>, per-character POS
    padded with 'un', and one BIO tag per character (unpadded) — so a
    window of 3 over the padded characters yields exactly one feature
    dict per tag.
    """
    words_seq = [[w.split(u'/')[0] for w in words] for words in words_list]
    pos_seq = [[w.split(u'/')[1] for w in words] for words in words_list]
    tag_seq = [[pos_to_tag(p) for p in pos] for pos in pos_seq]
    # Expand word-level POS / tags to character level.
    pos_seq = [[[pos_seq[i][j] for _ in range(len(words_seq[i][j]))]
                for j in range(len(pos_seq[i]))]
               for i in range(len(pos_seq))]
    tag_seq = [[[tag_perform(tag_seq[i][j], k)
                 for k in range(len(words_seq[i][j]))]
                for j in range(len(tag_seq[i]))]
               for i in range(len(tag_seq))]
    # Flatten to one sequence per sentence, with boundary padding.
    pos_seq = [[u'un'] + [pos_perform(p) for pos in sent for p in pos] + [u'un']
               for sent in pos_seq]
    tag_seq = [[t for tag in sent for t in tag] for sent in tag_seq]
    word_seq = [[u'<BOS>'] + [c for word in sent for c in word] + [u'<EOS>']
                for sent in words_seq]
    return pos_seq, tag_seq, word_seq


# Module-level pipeline execution, kept from the original script:
# pre-process the raw corpus, then build the global training sequences
# that generator()/train() read.
pre_process()
pos_seq, tag_seq, word_seq = initialize()


def extract_feature(word_grams):
    """Turn 3-character windows into CRF feature dicts (one list per sentence)."""
    features = []
    for sent_grams in word_grams:
        feature_list = []
        for gram in sent_grams:
            feature_list.append({
                u'w-1': gram[0],
                u'w': gram[1],
                u'w+1': gram[2],
                u'w-1:w': gram[0] + gram[1],
                u'w:w+1': gram[1] + gram[2],
                u'bias': 1.0,
            })
        features.append(feature_list)
    return features


def segment_by_window(words_list=None, window=3):
    """Slide a window of size ``window`` over ``words_list``.

    Returns len(words_list) - window + 1 slices (empty for short input).
    """
    windows = []
    begin, end = 0, window
    for _ in range(1, len(words_list)):
        if end > len(words_list):
            break
        windows.append(words_list[begin:end])
        begin += 1
        end += 1
    return windows


def generator():
    """Build (features, labels) for training.

    One feature dict per character window, aligned with the unpadded
    per-character tag sequence.
    """
    word_grams = [segment_by_window(word_list) for word_list in word_seq]
    features = extract_feature(word_grams)
    return features, tag_seq


# --- model -------------------------------------------------------------------
algorithm = 'lbfgs'
c1 = 0.1            # FIX: originally passed as the string "0.1"
c2 = 0.1            # FIX: originally passed as the string "0.1"
max_iterations = 100
model_path = r"D:\workspace\project\NLPcase\ner\model\model.pkl"
model = sklearn_crfsuite.CRF(algorithm=algorithm, c1=c1, c2=c2,
                             max_iterations=max_iterations,
                             all_possible_transitions=True)


def save_model(model, model_path):
    """Persist the trained CRF to ``model_path``."""
    joblib.dump(model, model_path)


def load_model(model_path):
    """Load a previously saved CRF model from ``model_path``."""
    return joblib.load(model_path)


def train(model_path):
    """Train on all but the first 500 sentences, score on those 500, save.

    FIX: the original computed the weighted F1 score and discarded it, and
    would crash on ``labels.remove('O')`` if 'O' were absent.
    """
    x, y = generator()
    x_train, y_train = x[500:], y[500:]
    x_test, y_test = x[:500], y[:500]
    model.fit(x_train, y_train)
    labels = list(model.classes_)
    if 'O' in labels:      # 'O' dominates; exclude it from the score
        labels.remove('O')
    y_predict = model.predict(x_test)
    f1 = metrics.flat_f1_score(y_test, y_predict,
                               average='weighted', labels=labels)
    print('weighted F1 on held-out sentences: {}'.format(f1))
    save_model(model, model_path)


def predict(sent):
    """Extract entities from ``sent``.

    Returns the entity characters concatenated, with a space between
    consecutive entities of different types.
    """
    trained = load_model(model_path)
    u_sent = q_to_b(sent)
    word_lists = [[u'<BOS>'] + [c for c in u_sent] + [u'<EOS>']]
    word_grams = [segment_by_window(word_list) for word_list in word_lists]
    features = extract_feature(word_grams)
    tags = trained.predict(features)[0]
    entity = u''
    for index in range(len(tags)):
        if tags[index] != u'O':
            # A change in the type suffix starts a new entity.
            if index > 0 and tags[index][-1] != tags[index - 1][-1]:
                entity += u' '
            entity += u_sent[index]
        # FIX: guard against IndexError when the leading tags are all 'O'
        # (the original indexed entity[-1] on an empty string).
        elif entity and entity[-1] != u' ':
            entity += u' '
    return entity
参考资料
https://blog.csdn.net/leitouguan8655/article/details/83382412
https://blog.csdn.net/lhxsir/article/details/83387240
相关文章推荐
- 基于深度学习的命名实体识别bi-lstm+crf
- 基于条件随机场(CRF)的命名实体识别
- 基于crf的命名实体识别的一部分总结加文本分类大致流程
- 基于条件随机场(CRF)的组织机构实体识别
- 基于条件随机场(CRF)的组织机构实体识别
- 基于CRF工具的机器学习方法命名实体识别的过程
- 【转】基于VSM的命名实体识别、歧义消解和指代消解
- 基于深层神经网络的命名实体识别技术
- 基于深层神经网络的命名实体识别技术
- CRF++实体识别
- 基于深层神经网络的命名实体识别技术
- 命名实体识别(named entity recognition )基于统计方法的技术比较
- 【工程处理技巧一篇】基于半规则数据的命名实体消歧识别【未完】
- 基于FOFE的命名实体识别局部检测方法
- 【文智背后的奥秘】系列篇——基于CRF的人名识别
- CRF++地名实体识别(特征为词性和词)
- CRF命名实体识别的一些tips
- 基于分布式的短文本命名实体识别之----人名识别(python实现)
- 通俗理解BiLSTM-CRF命名实体识别模型中的CRF层
- 基于规则的命名实体识别