
Code for Building a Simple Question-Answering System (with Step-by-Step Workflow)


First, the overall workflow for building this simple question-answering system:

1. Read the corpus and split it into aligned question and answer lists.
2. Preprocess the questions: tokenize, lowercase, stem, normalize numbers, and remove stopwords.
3. Convert each preprocessed question into a sentence vector using GloVe word embeddings.
4. Build an inverted index to shortlist candidate questions that share words with the query.
5. Rank the candidates by cosine similarity and return the answers to the top-5 most similar questions.

Note that I read the files with absolute paths here, so you will need to change them to your own locations. The files used in the code can be downloaded below if needed:
Link: https://pan.baidu.com/s/14srw2A_RIgTc7ejP3KJQaQ
Extraction code: n4n5

import json
import math
from collections import Counter, defaultdict
from queue import PriorityQueue

import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
# Split the corpus into aligned question and answer lists
def read_corpus(filepath):
    qlist = []
    alist = []
    with open(filepath) as f:
        file_array = json.load(f)['data']
    for article in file_array:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                qlist.append(qa['question'])
                try:
                    alist.append(qa['answers'][0]['text'])
                except IndexError:
                    # Unanswerable question (no answers): drop it to keep the lists aligned
                    qlist.pop()
    assert len(qlist) == len(alist)  # make sure the lengths match
    return qlist, alist
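
# For reference, SQuAD-style JSON nests as data -> paragraphs -> qas, roughly:
# {"data": [{"paragraphs": [{"qas": [{"question": ...,
#                                     "answers": [{"text": ...}]}]}]}]}
# A quick sanity check (path is an assumption; point it at your own copy):
# qlist, alist = read_corpus(r'C:\Users\Administrator\Desktop\train-v2.0.json')
# print(len(qlist), qlist[0], alist[0])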

def text_preprocessing(text):
    # Load the stopword list and set up the stemmer
    stopfile_path = r'C:\Users\Administrator\nltk_data\corpora\stopwords\baidu_stopwords.txt'
    with open(stopfile_path, 'r', encoding='UTF-8') as f:
        sw = set(f.read().split())  # split into a set of words, not characters
    # Keep question words, since they carry the intent of a question
    sw -= {'when', 'who', 'why', 'what', 'how', 'where', 'which'}
    ps = PorterStemmer()

    seg = []
    # Tokenize with nltk
    for word in word_tokenize(text):
        # Lowercase and stem
        word = ps.stem(word.lower())
        # Normalize all numbers to a single token
        word = '#number' if word.isdigit() else word
        # Drop stopwords and single characters
        if len(word) > 1 and word not in sw:
            seg.append(word)
    return seg
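
# A quick example of what the preprocessing produces (output is indicative;
# the exact tokens depend on your stopword list and NLTK version):
# text_preprocessing("Which airport was shut down in 2008?")
# -> something like ['which', 'airport', 'shut', '#number']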
# Preprocess the whole question list
def qlist_preprocessing(qlist):
    word_cnt = Counter()
    qlist_seg = []
    for text in qlist:
        seg = text_preprocessing(text)
        qlist_seg.append(seg)
        word_cnt.update(seg)

    # Drop the rarest words: the cutoff is the count of the word at rank
    # V**0.99 in the sorted frequency list (V = vocabulary size)
    value_sort = sorted(word_cnt.values(), reverse=True)
    min_tf = value_sort[int(math.exp(0.99 * math.log(len(word_cnt))))]
    for cur in range(len(qlist_seg)):
        qlist_seg[cur] = [word for word in qlist_seg[cur] if word_cnt[word] > min_tf]
    return qlist_seg
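
# The cutoff index above is just V**0.99, since math.exp(0.99 * math.log(V))
# equals V**0.99. For example, with a vocabulary of V = 20000 words,
# int(20000**0.99) = 18114, so min_tf is the count of the word at rank 18114
# -- effectively trimming only the long tail of very rare words.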

# Load pre-trained GloVe vectors (already converted to word2vec format)
model = KeyedVectors.load_word2vec_format(r'C:\Users\Administrator\Desktop\glove2word2vec.6B.100d.txt')
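
# If you only have the raw GloVe file, it can be converted once with the
# glove2word2vec script imported above (file names here are assumptions --
# use whatever matches your download):
# glove2word2vec(r'glove.6B.100d.txt', r'glove2word2vec.6B.100d.txt')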
# Turn a tokenized question into a sentence vector by averaging word vectors
def docvec_get(seg):
    vector = np.zeros((1, 100))
    size = len(seg)
    for word in seg:
        try:
            vector += model[word]
        except KeyError:
            # Out-of-vocabulary word: skip it and shrink the denominator
            size -= 1
    return vector / size if size > 0 else vector  # guard against all-OOV input
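
# Quick check of the sentence vector shape (purely illustrative):
# print(docvec_get(text_preprocessing("Which airport was shut down?")).shape)  # (1, 100)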

def top5results_invidx(input_q):
    qlist, alist = read_corpus(r'C:\Users\Administrator\Desktop\train-v2.0.json')
    alist = np.array(alist)
    qlist_seg = qlist_preprocessing(qlist)  # preprocess the question list
    seg = text_preprocessing(input_q)       # preprocess the input question

    # Vectorize the preprocessed question list
    X = np.zeros((len(qlist_seg), 100))
    for cur in range(X.shape[0]):
        X[cur] = docvec_get(qlist_seg[cur])
    Xnorm2 = np.linalg.norm(X, axis=1, keepdims=True)
    Xnorm2[Xnorm2 == 0] = 1  # avoid dividing by zero for questions with no surviving words
    X = X / Xnorm2

    # Build a simple inverted index: word -> set of question indices
    inverted_idx = defaultdict(set)
    for cur in range(len(qlist_seg)):
        for word in qlist_seg[cur]:
            inverted_idx[word].add(cur)
    candidates = set()
    for word in seg:
        # Union of all questions containing any word of the query
        candidates = candidates | inverted_idx[word]
    candidates = list(candidates)

    # Vectorize the preprocessed input question
    input_vec = docvec_get(seg)
    # Normalize by the L2 norm of the query vector
    qnorm2 = np.linalg.norm(input_vec, axis=1, keepdims=True)
    input_vec = input_vec / qnorm2
    sim = X[candidates] @ input_vec.T  # cosine similarity; @ is matrix-vector multiplication

    # Use a priority queue (min-heap) to keep the indices of the 5 most similar questions
    pq = PriorityQueue()
    for cur in range(sim.shape[0]):
        pq.put((sim[cur][0], candidates[cur]))
        if len(pq.queue) > 5:
            pq.get()  # evict the current minimum
    pq_rank = sorted(pq.queue, reverse=True, key=lambda x: x[0])
    print([x[0] for x in pq_rank])  # similarity scores, highest first
    top_idxs = [x[1] for x in pq_rank]
    return [alist[i] for i in top_idxs]  # answers of the top-5 most similar questions
print(top5results_emb("Which airport was shut down?"))    # 在问题库中存在,经过对比,返回的首结果正确
print(top5results_emb("Which airport is closed?"))
print(top5results_emb("What government blocked aid after Cyclone Nargis?"))    # 在问题库中存在,经过对比,返回的首结果正确
print(top5results_emb("Which government stopped aid after Hurricane Nargis?"))