您的位置:首页 > Web前端 > HTML

Epub 转 txt

2012-08-31 19:34 260 查看
# -*- coding: cp936 -*-
import sys,re,zipfile,HTMLParser,os

class GetContent(HTMLParser.HTMLParser):
    """HTML parser that collects the text data it is handed.

    Every chunk the parser reports through ``handle_data`` is appended to
    ``content``, so after one or more ``feed()`` calls ``content`` holds
    all text seen so far, in the order it was received.
    """

    def __init__(self):
        # HTMLParser is not a new-style class, so super() cannot be used;
        # the base initializer is called explicitly instead.
        HTMLParser.HTMLParser.__init__(self)
        self.content = ""

    def handle_data(self, data):
        """Append one chunk of text data to ``content``."""
        self.content = self.content + data

# The capturing group makes re.split() keep the digit runs in its result.
re_digits = re.compile(r'(\d+)')


def embedded_numbers(s):
    """Split *s* into alternating text and integer pieces.

    Because ``re_digits`` has a capturing group, splitting always yields
    text at even indexes and digit runs at odd indexes (the first and last
    pieces may be empty strings).  The digit runs are converted to ``int``
    so that lists built from different names compare numerically, e.g.
    "ch2" orders before "ch10".

    Returns a list such as ``["ch", 10, ".html"]``.
    """
    return [
        int(piece) if index % 2 else piece
        for index, piece in enumerate(re_digits.split(s))
    ]

def sort_with_embedded_numbers(zipinfo_list):
    """Return the entries of *zipinfo_list* in natural filename order.

    Each entry must expose a ``filename`` attribute.  Entries are ordered by
    ``embedded_numbers(entry.filename)``, so digit runs compare as numbers
    ("ch2.html" sorts before "ch10.html").

    A key function is used instead of sorting ``(key, entry)`` tuples: with
    tuples, two entries whose keys are equal would fall through to comparing
    the entries themselves, which is arbitrary on Python 2 and a TypeError
    on Python 3.  ``sorted`` is stable, so entries with equal keys keep
    their input order.

    Returns a new list; the input list is not modified.
    """
    return sorted(
        zipinfo_list,
        key=lambda zipinfo: embedded_numbers(zipinfo.filename),
    )

# Batch conversion: for every "*.epub" file in the current working directory
# that has no "<same base name>.txt" next to it yet, concatenate the text of
# all HTML members of the archive (in natural filename order) into that .txt.
dir_entries = os.listdir(os.getcwd())
existing_names = set(dir_entries)  # O(1) "is there already a .txt?" checks

# Only a trailing ".epub" counts as the extension, and only that suffix is
# swapped for ".txt"; a substring test / str.replace would also hit names
# such as "my.epub.notes.epub" or "book.epub.bak".
Files = [
    f for f in dir_entries
    if f.endswith(".epub")
    and f[:-len(".epub")] + ".txt" not in existing_names
]

for fname in Files:
    fh = zipfile.ZipFile(fname)
    try:
        # "html"/"htm" suffixes also cover ".xhtml" members.
        html_list = sort_with_embedded_numbers([
            zip_info
            for zip_info in fh.infolist()
            if zip_info.filename.endswith(("html", "htm"))
        ])
        # One parser per book: text from all members accumulates in order.
        content_obj = GetContent()
        for html in html_list:
            content_obj.feed(fh.read(html))
    finally:
        # Always release the archive handle, even if parsing fails.
        fh.close()

    output_filename = fname[:-len(".epub")] + ".txt"
    with open(output_filename, 'w') as out_fh:
        out_fh.write(content_obj.content)

    # Emits the same text as the statement `print fname," done!"`.
    sys.stdout.write("%s  done!\n" % fname)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  numbers html list output class