您的位置:首页 > 编程语言 > Python开发

python转换html到pdf文件

2018-01-15 10:14 232 查看
1.安装wkhtmltopdf

Windows平台直接在 http://wkhtmltopdf.org/downloads.html 下载稳定版的 wkhtmltopdf 进行安装,安装完成之后把该程序的执行路径加入到系统环境 $PATH 变量中,否则 pdfkit 找不到 wkhtmltopdf 就出现错误 “No wkhtmltopdf executable found”



2.安装pdfkit

直接pip install pdfkit

pdfkit 是 wkhtmltopdf 的Python封装包

import pdfkit

# pdfkit offers three entry points to produce a PDF:
# from a live URL, from a local HTML file, or from a raw HTML string.
pdfkit.from_url('http://google.com', 'out.pdf')
pdfkit.from_file('test.html', 'out.pdf')
pdfkit.from_string('Hello!', 'out.pdf')


3.合并pdf,使用PyPDF2

直接pip install PyPDF2

from PyPDF2 import PdfFileMerger

# Merge 1.pdf and 2.pdf into hql_all.pdf.
# FIX: the original opened three file handles and never closed any of
# them (nor the merger). Context managers + finally guarantee cleanup
# even if append/write raises. The inputs must stay open until write()
# finishes, because PdfFileMerger reads from them lazily.
merger = PdfFileMerger()
try:
    with open("1.pdf", "rb") as input1, open("2.pdf", "rb") as input2:
        merger.append(input1)
        merger.append(input2)
        # Write the combined document to the output PDF.
        with open("hql_all.pdf", "wb") as output:
            merger.write(output)
finally:
    merger.close()


4.综合示例:

1 # coding=utf-8
2 import os
3 import re
4 import time
5 import logging
6 import pdfkit
7 import requests
8 from bs4 import BeautifulSoup
9 from PyPDF2 import PdfFileMerger
10
# Skeleton HTML page: each scraped article body is substituted into
# {content} so wkhtmltopdf renders it as a standalone UTF-8 document.
# NOTE: the string body must stay exactly as-is; whitespace inside it
# is part of the rendered output.
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>

"""
23
24
def parse_url_to_html(url, name):
    """
    Fetch a tutorial page and save its main content as a local HTML file.

    :param url: page URL to download and parse
    :param name: file name the extracted HTML is written to
    :return: the file name on success, None if any step failed
    """
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Article body
        body = soup.find_all(class_="x-wiki-content")[0]
        # Page title
        title = soup.find('h4').get_text()

        # Prepend the title to the body as a centered <h1> heading
        center_tag = soup.new_tag("center")
        title_tag = soup.new_tag('h1')
        title_tag.string = title
        center_tag.insert(1, title_tag)
        body.insert(1, center_tag)
        html = str(body)
        # Rewrite relative <img src> attributes to absolute URLs.
        # Groups: (1) tag text up to the opening quote, (2) the src
        # value, (3) the closing quote.
        pattern = "(<img .*?src=\")(.*?)(\")"

        def func(m):
            # BUG FIX: the original tested m.group(3) -- the closing
            # quote, which never starts with "http" -- so the site
            # prefix was prepended to EVERY src, absolute ones included.
            # The src value is group(2).
            if not m.group(2).startswith("http"):
                rtn = m.group(1) + "http://www.liaoxuefeng.com" + m.group(2) + m.group(3)
                return rtn
            else:
                return m.group(1) + m.group(2) + m.group(3)
        html = re.compile(pattern).sub(func, html)
        html = html_template.format(content=html)
        html = html.encode("utf-8")
        with open(name, 'wb') as f:
            f.write(html)
        return name

    except Exception as e:
        # Best-effort: log the failure (with traceback) and return None
        # so the caller can continue with the remaining pages.
        logging.error("解析错误", exc_info=True)
66
67
def get_url_list():
    """
    Collect the absolute URL of every chapter listed in the tutorial's
    side menu.

    :return: list of chapter page URLs
    """
    index_url = "http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000"
    response = requests.get(index_url)
    soup = BeautifulSoup(response.content, "html.parser")
    # The second "uk-nav uk-nav-side" element is the chapter menu.
    menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
    return ["http://www.liaoxuefeng.com" + li.a.get('href')
            for li in menu_tag.find_all("li")]
81
82
def save_pdf(htmls, file_name):
    """
    Render HTML file(s) into a single PDF via wkhtmltopdf.

    :param htmls: HTML file name (or list of names) to convert
    :param file_name: output PDF file name
    :return: None
    """
    # wkhtmltopdf options: Letter pages with uniform 0.75in margins,
    # UTF-8 input, gzip-capable request header, example cookies, and a
    # deep bookmark outline.
    margin = '0.75in'
    options = {
        'page-size': 'Letter',
        'margin-top': margin,
        'margin-right': margin,
        'margin-bottom': margin,
        'margin-left': margin,
        'encoding': "UTF-8",
        'custom-header': [('Accept-Encoding', 'gzip')],
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        'outline-depth': 10,
    }
    pdfkit.from_file(htmls, file_name, options=options)
107
108
def main():
    """Download every tutorial chapter, convert each to a PDF, merge
    them into one document, then delete the intermediate files."""
    start = time.time()
    file_name = u"liaoxuefeng_Python3_tutorial"
    urls = get_url_list()
    # Download each chapter into a numbered local HTML file.
    for index, url in enumerate(urls):
        parse_url_to_html(url, str(index) + ".html")
    htmls = []
    pdfs = []
    # BUG FIX: iterate over the pages actually downloaded instead of
    # the hard-coded range(0, 124), which breaks whenever the chapter
    # count changes.
    for i in range(len(urls)):
        htmls.append(str(i) + '.html')
        pdfs.append(file_name + str(i) + '.pdf')
        save_pdf(str(i) + '.html', file_name + str(i) + '.pdf')
        print(u"转换完成第" + str(i) + '个html')

    # FIX: the original never closed the input handles, the output
    # file, or the merger. Inputs must stay open until write() is done
    # because PdfFileMerger reads them lazily, so close them in the
    # finally block.
    merger = PdfFileMerger()
    inputs = []
    try:
        # BUG FIX: the original printed the stale loop variable `i`
        # left over from the conversion loop; enumerate gives the
        # correct per-pdf index.
        for i, pdf in enumerate(pdfs):
            handle = open(pdf, 'rb')
            inputs.append(handle)
            merger.append(handle)
            print(u"合并完成第" + str(i) + '个pdf' + pdf)

        with open(u"廖雪峰Python_all.pdf", "wb") as output:
            merger.write(output)
    finally:
        merger.close()
        for handle in inputs:
            handle.close()

    print(u"输出PDF成功!")

    # Remove the intermediate per-chapter files.
    for html in htmls:
        os.remove(html)
        print(u"删除临时文件" + html)

    for pdf in pdfs:
        os.remove(pdf)
        print(u"删除临时文件" + pdf)

    total_time = time.time() - start
    print(u"总共耗时:%f 秒" % total_time)


if __name__ == '__main__':
    main()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: