您的位置：首页 > 编程语言 > Python开发

以文件为单位的分句、分词python封装脚本

2016-03-20 15:52 363 查看

利用nlpir进行分词时，需要考虑以下两个问题：

1、如何分句、分段

2、如何表示分词结果

在网上找了几个分句的脚本，感觉都有问题，只好自己写了一个。脚本虽然比较简单，但要做到面面俱到仍需仔细考虑。分词结果采用XML文件标注，包括Article、Paragraph、Sentence三层结构。代码已加注释，大家自己看吧。这个脚本经过了几次测试，应该可以应付大多数正常的文本文件；如果有问题，欢迎反馈。

解析结果如下（节选，仅展示前两个句子）：

<?xml version="1.0" encoding="utf-8"?>



<Article>

<Paragraph id="0">

<Sentence context="小说课上，正讲着小说，我停下来发问：“爱的反面是什么！" id="0">

<word context="小说" pos="n"/>

<word context="课" pos="n"/>

<word context="上" pos="f"/>

<word context="，" pos="wd"/>

<word context="正" pos="d"/>

<word context="讲" pos="v"/>

<word context="着" pos="uzhe"/>

<word context="小说" pos="n"/>

<word context="，" pos="wd"/>

<word context="我" pos="rr"/>

<word context="停" pos="vi"/>

<word context="下来" pos="vf"/>

<word context="发问" pos="vi"/>

<word context="：" pos="wp"/>

<word context="“" pos="wyz"/>

<word context="爱" pos="v"/>

<word context="的" pos="ude1"/>

<word context="反面" pos="n"/>

<word context="是" pos="vshi"/>

<word context="什么" pos="ry"/>

<word context="！" pos="wt"/>

</Sentence>

<Sentence context="”“恨！" id="1">

<word context="”" pos="wyy"/>

<word context="“" pos="wyz"/>

<word context="恨" pos="v"/>

<word context="！" pos="wt"/>

</Sentence>

代码如下：

# -*- coding: utf8 -*-
__author__ = 'luoshaowei<luoshaowei@163.com>'
import nlpir
import os
from xml.dom import minidom

# Sentence-terminating punctuation (full-width period, exclamation mark and
# question mark). A unicode literal is used instead of str.decode() so the
# same line works on both Python 2 and Python 3.
cutlist = u'。！？'
# Add the root node
def AddRoot(doc):
    """Attach a header comment and the root ``Article`` element to *doc*.

    Args:
        doc: an ``xml.dom.minidom.Document`` that has no root element yet.
    """
    # A unicode literal replaces the Python-2-only ``'...'.decode('utf8')``
    # so the function also runs on Python 3; the comment text is unchanged.
    doc.appendChild(doc.createComment(u'分词结果'))
    article = doc.createElement('Article')
    doc.appendChild(article)
# Add a paragraph node
def AddParagraph(doc, id):
    """Append a ``Paragraph`` element to the document's root element.

    Args:
        doc: an ``xml.dom.minidom.Document`` whose root element already exists.
        id: paragraph identifier; stored as the string form in the ``id``
            attribute.

    Returns:
        The newly created ``Paragraph`` element, so callers can fill it
        directly instead of looking it up again by id.
    """
    parentnode = doc.documentElement
    node = doc.createElement('Paragraph')
    node.setAttribute('id', str(id))
    parentnode.appendChild(node)
    return node
# Write one sentence and its segmented words into a node
def AddSentence(doc, parentnode, id, context, dividelist):
    """Append a ``Sentence`` element with one ``word`` child per token.

    Args:
        doc: the ``xml.dom.minidom.Document`` used to create elements.
        parentnode: element that receives the new ``Sentence`` element.
        id: sentence identifier; stored as its string form.
        context: the full sentence text, stored in the ``context`` attribute.
        dividelist: iterable of indexable items where item[0] is the word
            and item[1] is its part-of-speech tag. Either may be a UTF-8
            byte string or already-decoded text.
    """
    def _as_text(value):
        # Decode only real byte strings. Calling .decode() on text that is
        # already unicode fails for non-ASCII input on Python 2 and does
        # not exist on Python 3.
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    snode = doc.createElement('Sentence')
    snode.setAttribute('id', str(id))
    snode.setAttribute('context', context)
    for word in dividelist:
        wordnode = doc.createElement('word')
        wordnode.setAttribute('context', _as_text(word[0]))
        wordnode.setAttribute('pos', _as_text(word[1]))
        snode.appendChild(wordnode)

    parentnode.appendChild(snode)
# Look up a paragraph node by its id
def GetParageaphbyid(doc, id):
    """Return the first ``Paragraph`` element whose ``id`` attribute matches.

    Args:
        doc: the ``xml.dom.minidom.Document`` to search.
        id: identifier to look for; compared using its string form.

    Returns:
        The first matching element, or the empty string ``''`` when no
        paragraph carries that id.
    """
    wanted = str(id)
    candidates = (
        para for para in doc.getElementsByTagName('Paragraph')
        if para.getAttribute('id') == wanted
    )
    return next(candidates, '')
# Decide whether a line ends a paragraph: it must end with a sentence
# terminator immediately followed by a newline
def IsParagraphEnd(line):
    """Return True when *line* closes a paragraph.

    A line closes a paragraph when its last character is ``'\\n'`` and the
    character just before it is one of the terminators in ``cutlist``.

    Args:
        line: one line of text, including its trailing newline if present.

    Returns:
        True or False. Lines shorter than two characters are never a
        paragraph end (the original indexed ``line[-2]`` unconditionally and
        raised IndexError on them).
    """
    # Cheap structural checks first; they also protect the [-2] index below.
    if len(line) < 2 or line[-1] != '\n':
        return False
    return FindToken(cutlist, line[-2])

# Check whether a character is a sentence-terminating mark
def FindToken(cutlist, char):
    """Return True if *char* is contained in *cutlist*, otherwise False.

    Args:
        cutlist: container (string or sequence) of terminator characters.
        char: the character to test.
    """
    return char in cutlist

# Split one file into sentences and segment them; the caller names the
# source file and the destination file
def divide_sentence(sourcefile, destfile, encoding='gbk'):
    """Split a text file into paragraphs and sentences and write them as XML.

    Each non-blank line is cut into sentences at the terminators in
    ``cutlist``. Text that has not reached a terminator is carried over to
    the next line. A paragraph is closed when ``IsParagraphEnd`` accepts the
    line or when the last non-blank line of the file is reached. Every
    sentence is segmented with ``nlpir.seg`` and stored through
    ``AddSentence``. Sentence ids keep counting across paragraphs.

    Args:
        sourcefile: path of the text file to read.
        destfile: path of the XML file to write (UTF-8).
        encoding: encoding of *sourcefile*; defaults to ``'gbk'``, the value
            that was previously hard-coded.

    Returns:
        0 on completion. Errors propagate as exceptions, and the destination
        file is only written after the whole source was processed.
    """
    import io  # function-scope import keeps this block self-contained

    # Decode once at the I/O boundary and drop blank lines up front. Trailing
    # blank lines therefore can no longer hide the end of the article (the
    # original skipped the final flush when the last line was blank).
    with io.open(sourcefile, 'r', encoding=encoding) as fps:
        lines = [line for line in fps if line.strip()]

    xmldoc = minidom.Document()
    AddRoot(xmldoc)
    paragraphid = 0
    sentenceid = 0
    sentencelist = []
    tempsentence = u''
    lastindex = len(lines) - 1

    for index, line in enumerate(lines):
        isarticleend = index == lastindex
        # The last line is tested first so IsParagraphEnd only ever sees
        # lines that still carry their trailing newline.
        isparaend = isarticleend or IsParagraphEnd(line)

        # Drop the trailing newline and leading blanks, then continue the
        # sentence left unfinished by the previous line.
        text = tempsentence + line.strip('\n').lstrip()
        start = 0
        for pos, char in enumerate(text):
            if FindToken(cutlist, char):
                sentencelist.append(text[start:pos + 1])
                start = pos + 1
        tempsentence = text[start:]

        # At the end of the file an unterminated remainder is a sentence too.
        if isarticleend and tempsentence:
            sentencelist.append(tempsentence)
            tempsentence = u''

        # Create the paragraph only when there is something to put in it, so
        # no empty Paragraph elements are emitted.
        if isparaend and sentencelist:
            AddParagraph(xmldoc, paragraphid)
            paranode = GetParageaphbyid(xmldoc, paragraphid)
            for sen in sentencelist:
                # NOTE(review): nlpir.seg is given UTF-8 bytes and presumably
                # returns (word, pos) pairs - confirm against the nlpir module.
                wordlist = nlpir.seg(sen.encode('utf-8'))
                AddSentence(xmldoc, paranode, sentenceid, sen, wordlist)
                sentenceid += 1
            sentencelist = []
            paragraphid += 1

    # toprettyxml(encoding=...) returns encoded bytes, so the file is opened
    # in binary mode and no implicit ASCII conversion can occur.
    with open(destfile, 'wb') as fpd:
        fpd.write(xmldoc.toprettyxml(indent=' ', newl='\n', encoding='utf-8'))
    return 0

# Example run: segment test.txt and store the XML result next to it.
# The destination carries an .html extension although XML is written to it.
sourcefile, destfile = (
    'E:\\Project\\Python\\Ictclas_test\\test.txt',
    'E:\\Project\\Python\\Ictclas_test\\test.html',
)

divide_sentence(sourcefile, destfile)

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： nlpir python 分词

相关文章推荐

新的分享

章节导航