您的位置:首页 > 编程语言 > Python开发

Python 实现 Word 文档的图片与文字分离

2013-11-07 16:30 751 查看
运行前需要先安装 pywin32(例如 pywin32-218.win-amd64-py3.3),并且本机需已安装 Microsoft Word。
#coding:utf-8
from win32com import client as wc
import os
import glob
# Module-level Word COM automation object, created once at import time via
# the 'Word.Application' ProgID. wordsToHtml() uses it to open documents and
# calls word.Quit() on it when it finishes.
word = wc.Dispatch('Word.Application')

# Word "WdSaveFormat" codes passed to Document.SaveAs (same values the
# script has always used, now named).
_WD_FORMAT_FILTERED_HTML = 10         # wdFormatFilteredHTML
_WD_FORMAT_DOS_TEXT_LINE_BREAKS = 5   # wdFormatDOSTextLineBreaks


def _splitWordDocument(wordFullName):
    """Export one Word document as HTML and as plain text, then drop the HTML.

    The document is saved next to the source file as ``<name>.html`` and
    ``<name>.txt``. The HTML file itself is deleted afterwards; whatever
    companion resources Word wrote during the HTML export (presumably the
    extracted pictures -- naming and location are decided by Word) are kept.

    The document is always closed, even if saving or deleting fails.
    """
    # splitext replaces the fragile fixed-width slicing ([:-3] / [:-4]).
    root = os.path.splitext(wordFullName)[0]
    htmlfullName = root + '.html'
    txtfullName = root + '.txt'

    doc = word.Documents.Open(wordFullName)
    try:
        print('正在处理图片----------'+htmlfullName)
        print('正在处理文字----------'+txtfullName)

        doc.SaveAs(htmlfullName, _WD_FORMAT_FILTERED_HTML)
        # After this second SaveAs the open document is bound to the .txt
        # file, so the .html file is no longer held by Word and can be removed.
        doc.SaveAs(txtfullName, _WD_FORMAT_DOS_TEXT_LINE_BREAKS)

        os.remove(htmlfullName)
        print('正在删除html文件----------'+htmlfullName)
    finally:
        doc.Close()


def wordsToHtml(dir):
    """Split every ``.doc`` / ``.docx`` file in *dir* into pictures and text.

    All ``*.doc`` files are processed first, then all ``*.docx`` files. For
    each one, see ``_splitWordDocument``.

    Args:
        dir: Directory containing the Word documents. A relative path is
            resolved against the current working directory of this process.
            (The name shadows the builtin ``dir``; it is kept so existing
            callers that pass it by keyword keep working.)

    Side effects:
        Uses the module-level ``word`` COM object and calls ``word.Quit()``
        when done -- also when an error occurs -- so the Word process is not
        left running. The function therefore cannot be called a second time
        without re-creating ``word``.
    """
    # Word resolves relative paths against *its own* working directory, not
    # ours, so always hand it absolute paths.
    folder = os.path.abspath(dir)
    try:
        for pattern in ('*.doc', '*.docx'):
            for wordFullName in glob.glob(os.path.join(folder, pattern)):
                # "~$name.doc" files are Word's lock/owner files for documents
                # that are currently open; they are not real documents.
                if os.path.basename(wordFullName).startswith('~$'):
                    continue
                _splitWordDocument(wordFullName)
    finally:
        word.Quit()

if __name__ == '__main__':
    # Script entry point: convert every Word document found in the
    # hard-coded working folder.
    wordsToHtml(r'F:\python')
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: