python正向最大匹配分词和逆向最大匹配分词
2017-12-13 10:13
417 查看
正向最大匹配
# -*- coding:utf-8 -*-
CODEC='utf-8'
def u(s, encoding):
    """Return *s* as a text (unicode) string, decoding it when necessary.

    Args:
        s: A text string, which is returned unchanged, or an encoded byte
            string, which is decoded.
        encoding: Name of the codec used to decode *s*, e.g. ``'utf-8'``.

    Returns:
        The text-string form of *s*.
    """
    # ``unicode`` only exists on Python 2; on Python 3 ``str`` is the text
    # type and accepts the same ``(bytes, encoding)`` constructor call.
    try:
        text_type = unicode  # noqa: F821 - Python 2 builtin
    except NameError:
        text_type = str
    if isinstance(s, text_type):
        return s
    return text_type(s, encoding)
def fwd_mm_seg(wordDict, maxLen, str, verbose=True):
    """Segment text with the forward maximum matching algorithm.

    Scanning from the left end of the text, the longest prefix of at most
    ``maxLen`` characters that is found in ``wordDict`` is cut off as one
    segment.  When no prefix of two or more characters matches, a single
    character is emitted, so every iteration consumes input.

    Args:
        wordDict: Container of known words.  Only ``in`` tests and
            iteration are used, so a dict keyed by word or a set both work.
        maxLen: Maximum candidate length in characters; must be >= 1.
        str: Text to segment.  (The name shadows the builtin; it is kept
            unchanged so existing callers keep working.)
        verbose: When true (the default, matching the historical
            behaviour) print the dictionary, every candidate that is
            tried and the resulting segments to stdout.

    Returns:
        List of segments in text order; concatenating them gives back the
        input text.

    Raises:
        ValueError: If ``maxLen`` is smaller than 1.  Such a value consumes
            no characters per iteration and used to loop forever.
    """
    if maxLen < 1:
        raise ValueError("maxLen must be >= 1, got %r" % (maxLen,))

    def _trace(message):
        # Single-argument print() behaves the same on Python 2 and 3.
        if verbose:
            print(message)

    wordList = []
    segStr = str
    segStrLen = len(segStr)

    for word in wordDict:
        _trace("word:  %s" % (word,))
    _trace("\n")

    while segStrLen > 0:
        # Start with the longest candidate allowed by maxLen / remaining text.
        wordLen = min(maxLen, segStrLen)
        subStr = segStr[0:wordLen]
        _trace("subStr:  %s" % (subStr,))
        # Shrink the candidate from the right until it is a known word or a
        # single character is left (single characters are always accepted).
        while wordLen > 1:
            if subStr in wordDict:
                _trace("subStr1: %r" % (subStr,))
                break
            _trace("subStr2: %r" % (subStr,))
            wordLen -= 1
            subStr = subStr[0:wordLen]
        wordList.append(subStr)
        # Drop the consumed prefix and continue with the rest of the text.
        segStr = segStr[wordLen:]
        segStrLen -= wordLen

    for wordstr in wordList:
        _trace("wordstr:  %s" % (wordstr,))
    return wordList
def main():
    """Load ``words.dic`` and print a forward-segmentation demo.

    Every line of ``words.dic`` (looked up relative to the current working
    directory) is stripped, decoded as UTF-8 and added to the word
    collection.  A fixed sample string is then segmented with a maximum
    word length of 10 and the segments are printed joined by ``==``.
    """
    # Read raw bytes and decode explicitly so the result does not depend on
    # the platform's default text encoding; ``with`` closes the file even
    # if decoding fails.
    with open('words.dic', 'rb') as fp_dict:
        wordDict = set(u(eachWord.strip(), 'utf-8') for eachWord in fp_dict)
    segStr = u'你好世界hello world'
    print(segStr)
    wordList = fwd_mm_seg(wordDict, 10, segStr)
    print("==".join(wordList))


if __name__ == '__main__':
    main()
逆向最大匹配
# -*- coding:utf-8 -*-
CODEC='utf-8'
def u(s, encoding):
    """Return *s* as a text (unicode) string, decoding it when necessary.

    Args:
        s: A text string, which is returned unchanged, or an encoded byte
            string, which is decoded.
        encoding: Name of the codec used to decode *s*, e.g. ``'utf-8'``.

    Returns:
        The text-string form of *s*.
    """
    # ``unicode`` only exists on Python 2; on Python 3 ``str`` is the text
    # type and accepts the same ``(bytes, encoding)`` constructor call.
    try:
        text_type = unicode  # noqa: F821 - Python 2 builtin
    except NameError:
        text_type = str
    if isinstance(s, text_type):
        return s
    return text_type(s, encoding)
def fwd_mm_seg(wordDict, maxLen, str, verbose=True):
    """Segment text with the forward maximum matching algorithm.

    Scanning from the left end of the text, the longest prefix of at most
    ``maxLen`` characters that is found in ``wordDict`` is cut off as one
    segment.  When no prefix of two or more characters matches, a single
    character is emitted, so every iteration consumes input.

    Args:
        wordDict: Container of known words.  Only ``in`` tests and
            iteration are used, so a dict keyed by word or a set both work.
        maxLen: Maximum candidate length in characters; must be >= 1.
        str: Text to segment.  (The name shadows the builtin; it is kept
            unchanged so existing callers keep working.)
        verbose: When true (the default, matching the historical
            behaviour) print the dictionary, every candidate that is
            tried and the resulting segments to stdout.

    Returns:
        List of segments in text order; concatenating them gives back the
        input text.

    Raises:
        ValueError: If ``maxLen`` is smaller than 1.  Such a value consumes
            no characters per iteration and used to loop forever.
    """
    if maxLen < 1:
        raise ValueError("maxLen must be >= 1, got %r" % (maxLen,))

    def _trace(message):
        # Single-argument print() behaves the same on Python 2 and 3.
        if verbose:
            print(message)

    wordList = []
    segStr = str
    segStrLen = len(segStr)

    for word in wordDict:
        _trace("word:  %s" % (word,))
    _trace("\n")

    while segStrLen > 0:
        # Start with the longest candidate allowed by maxLen / remaining text.
        wordLen = min(maxLen, segStrLen)
        subStr = segStr[0:wordLen]
        _trace("subStr:  %s" % (subStr,))
        # Shrink the candidate from the right until it is a known word or a
        # single character is left (single characters are always accepted).
        while wordLen > 1:
            if subStr in wordDict:
                _trace("subStr1: %r" % (subStr,))
                break
            _trace("subStr2: %r" % (subStr,))
            wordLen -= 1
            subStr = subStr[0:wordLen]
        wordList.append(subStr)
        # Drop the consumed prefix and continue with the rest of the text.
        segStr = segStr[wordLen:]
        segStrLen -= wordLen

    for wordstr in wordList:
        _trace("wordstr:  %s" % (wordstr,))
    return wordList
def main():
    """Load ``words.dic`` and print a forward-segmentation demo.

    Every line of ``words.dic`` (looked up relative to the current working
    directory) is stripped, decoded as UTF-8 and added to the word
    collection.  A fixed sample string is then segmented with a maximum
    word length of 10 and the segments are printed joined by ``==``.
    """
    # Read raw bytes and decode explicitly so the result does not depend on
    # the platform's default text encoding; ``with`` closes the file even
    # if decoding fails.
    with open('words.dic', 'rb') as fp_dict:
        wordDict = set(u(eachWord.strip(), 'utf-8') for eachWord in fp_dict)
    segStr = u'你好世界hello world'
    print(segStr)
    wordList = fwd_mm_seg(wordDict, 10, segStr)
    print("==".join(wordList))


if __name__ == '__main__':
    main()
逆向最大匹配
# -*- coding:utf-8 -*-
def u(s, encoding):
    """Return *s* as a text (unicode) string, decoding it when necessary.

    Args:
        s: A text string, which is returned unchanged, or an encoded byte
            string, which is decoded.
        encoding: Name of the codec used to decode *s*, e.g. ``'utf-8'``.

    Returns:
        The text-string form of *s*.
    """
    # ``unicode`` only exists on Python 2; on Python 3 ``str`` is the text
    # type and accepts the same ``(bytes, encoding)`` constructor call.
    try:
        text_type = unicode  # noqa: F821 - Python 2 builtin
    except NameError:
        text_type = str
    if isinstance(s, text_type):
        return s
    return text_type(s, encoding)


CODEC = 'utf-8'


def bwd_mm_seg(wordDict, maxLen, str, verbose=True):
    """Segment text with the backward maximum matching algorithm.

    Scanning from the right end of the text, the longest suffix of at most
    ``maxLen`` characters that is found in ``wordDict`` is cut off as one
    segment.  When no suffix of two or more characters matches, a single
    character is emitted, so every iteration consumes input.

    Args:
        wordDict: Container of known words.  Only ``in`` tests and
            iteration are used, so a dict keyed by word or a set both work.
        maxLen: Maximum candidate length in characters; must be >= 1.
        str: Text to segment.  (The name shadows the builtin; it is kept
            unchanged so existing callers keep working.)
        verbose: When true (the default, matching the historical
            behaviour) print the dictionary, every candidate that is
            tried and the resulting segments to stdout.

    Returns:
        List of segments in text order; concatenating them gives back the
        input text.

    Raises:
        ValueError: If ``maxLen`` is smaller than 1.  Such a value consumes
            no characters per iteration and used to loop forever.
    """
    if maxLen < 1:
        raise ValueError("maxLen must be >= 1, got %r" % (maxLen,))

    def _trace(message):
        # Single-argument print() behaves the same on Python 2 and 3.
        if verbose:
            print(message)

    wordList = []
    segStr = str
    segStrLen = len(segStr)

    for word in wordDict:
        _trace("word:  %s" % (word,))
    _trace("\n")

    while segStrLen > 0:
        # Start with the longest candidate allowed by maxLen / remaining text.
        wordLen = min(maxLen, segStrLen)
        subStr = segStr[-wordLen:]
        _trace("subStr:  %s" % (subStr,))
        # Shrink the candidate from the left until it is a known word or a
        # single character is left (single characters are always accepted).
        while wordLen > 1:
            if subStr in wordDict:
                _trace("subStr1: %r" % (subStr,))
                break
            _trace("subStr2: %r" % (subStr,))
            wordLen -= 1
            subStr = subStr[-wordLen:]
        wordList.append(subStr)
        # Drop the consumed suffix and continue with the rest of the text.
        segStr = segStr[0:-wordLen]
        segStrLen -= wordLen

    # Segments were collected right-to-left; restore text order.
    wordList.reverse()
    for wordstr in wordList:
        _trace("wordstr:  %s" % (wordstr,))
    return wordList


def main():
    """Load ``words.dic`` and print a backward-segmentation demo.

    Every line of ``words.dic`` (looked up relative to the current working
    directory) is stripped, decoded with ``CODEC`` and added to the word
    collection.  A fixed sample string is then segmented with a maximum
    word length of 10 and the segments are printed joined by ``==``.
    """
    # Read raw bytes and decode explicitly so the result does not depend on
    # the platform's default text encoding; ``with`` closes the file even
    # if decoding fails.
    with open('words.dic', 'rb') as fp_dict:
        wordDict = set(u(eachWord.strip(), CODEC) for eachWord in fp_dict)
    # A plain u'' literal: the original ``ur`` prefix is a syntax error on
    # Python 3 and the text contains no backslashes that would need it.
    segStr = u'你好世界hello world'
    print(segStr)
    wordList = bwd_mm_seg(wordDict, 10, segStr)
    print("==".join(wordList))


if __name__ == '__main__':
    main()
相关文章推荐
- python正向最大匹配分词和逆向最大匹配分词的实例
- python 实现机械分词(1)-正向最大匹配算法
- 中文分词基础原则及正向最大匹配法、逆向最大匹配法、双向最大匹配法的分析
- 正向(逆向)最大匹配和最大概率法分词的错误分析
- 分词算法的python实现(正向最大匹配法)
- python实现机械分词之逆向最大匹配算法代码示例
- 中文分词引擎 java 实现 — 正向最大、逆向最大、双向最大匹配法
- 中文分词基础原则及正向最大匹配法、逆向最大匹配法、双向最大匹配法的分析
- 中文分词中的正向最大匹配与逆向最大匹配
- 中文分词算法之最大正向匹配算法(Python版)
- 中文分词算法之最大正向匹配算法(Python版)
- 简单的逆向最大匹配算法实现中文分词(Python)
- 用正向和逆向最大匹配算法进行中文分词
- python中文分词教程之前向最大正向匹配算法详解
- python 中文分词:正向最大匹配
- (2)中文分词——最大正向匹配算法及MMSEG分词算法
- 自然语言处理 最大逆向匹配分词算法
- NLP中文信息处理---正向最大匹配法分词
- 逆向最大匹配分词算法
- NLP——分词之正向(逆向、双向)最大