# 您的位置:首页 > 其它
#
# 自然语言处理NLP_中文分词_正向最大匹配算法
#
# 2020-07-12 16:48 85 查看
"""
正向最大匹配算法
"""
# Module-level store for the dictionary entries; filled in place by init().
dict_words = []


# Initialisation: load the dictionary.
def init():
    """Load the segmentation dictionary from ``dict.txt`` into ``dict_words``.

    The file is opened relative to the current working directory and is
    expected to hold one entry per line. Surrounding whitespace is stripped
    from every line and blank lines are skipped. Entries are appended to the
    module-level ``dict_words`` list, so calling this more than once appends
    the entries again.

    Raises:
        OSError: if ``dict.txt`` cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # "utf-8-sig" reads plain UTF-8 too, but drops a leading byte-order mark
    # so the first entry is not stored with an invisible "\ufeff" prefix
    # (which would prevent it from ever matching).
    with open("dict.txt", "r", encoding="utf-8-sig") as dict_input:
        stripped_lines = (line.strip() for line in dict_input)
        # Blank lines carry no entry; keep empty strings out of the dictionary.
        dict_words.extend(word for word in stripped_lines if word)

# Segmentation function.
def cut_words(words_input, dict_words):
    """Segment text with the forward maximum matching algorithm.

    Starting at the left edge of the (whitespace-stripped) input, the longest
    candidate window is tried first and shrunk one character at a time until
    it matches a dictionary entry. If nothing matches, a single character is
    emitted as its own token. The scan then resumes right after that token.

    Args:
        words_input: The string to segment.
        dict_words: Iterable of dictionary entries (strings). Empty entries
            are ignored. An empty dictionary is allowed, in which case every
            character becomes its own token.

    Returns:
        The tokens joined with "/", or "" when the stripped input is empty.
    """
    text = words_input.strip()
    text_length = len(text)

    # A set gives constant-time membership tests; the original list lookup
    # scanned the whole dictionary for every candidate window. Empty entries
    # can never match a non-empty window, so they are dropped here.
    vocabulary = {word for word in dict_words if word}

    # default=1 keeps an empty dictionary from raising ValueError, and, with
    # empty entries removed above, guarantees the window is at least one
    # character wide so the scan always advances.
    longest_entry = max((len(word) for word in vocabulary), default=1)

    tokens = []
    start = 0
    while start < text_length:
        # Never look past the end of the text.
        window = min(longest_entry, text_length - start)

        # Shrink until the window is a dictionary entry; a one-character
        # window is always accepted as the fallback.
        while window > 1 and text[start:start + window] not in vocabulary:
            window -= 1

        tokens.append(text[start:start + window])
        start += window

    # Segmentation result.
    return "/".join(tokens)

# Main function.
def main():
    """Run the interactive segmentation loop.

    Loads the dictionary through ``init()``, then repeatedly prompts for a
    line on standard input, segments it with ``cut_words()`` and prints the
    result. The loop ends on an empty line or when standard input is
    exhausted.
    """
    init()
    while True:
        print("请输入需要切分的序列:")
        try:
            words_input = input()
        except EOFError:
            # input() raises EOFError at end of input (piped stdin, Ctrl-D);
            # treat it like the empty-line exit instead of crashing.
            break
        if not words_input:
            break
        result = cut_words(words_input, dict_words)
        print("分词结果:")
        print(result)

# Script entry point: run the interactive loop only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
# 内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
# 标签: