您的位置:首页 > 其它

实现任意贴吧的爬虫,保存网页到本地

2019-01-20 13:27 337 查看
[code]# coding=utf-8
import requests
class TiebaSpider:
    """Crawl a Baidu Tieba forum and save each result page as a local HTML file."""

    def __init__(self, tieba_name, page_count=1):
        """
        :param tieba_name: name of the tieba (forum) to crawl
        :param page_count: number of result pages to fetch; defaults to 1,
            which matches the original hard-coded ``range(1)`` behavior
        """
        self.tieba_name = tieba_name
        self.page_count = page_count
        # Tieba paginates 50 posts per page; ``pn`` is the post offset.
        self.url_temp = "https://tieba.baidu.com/f?kw=" + tieba_name + "&pn={}"
        self.headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}

    def get_url_list(self):
        """Build the list of result-page URLs, one per page to crawl."""
        # List comprehension replaces the original append loop.
        return [self.url_temp.format(i * 50) for i in range(self.page_count)]

    def parse_url(self, url):
        """Fetch *url* and return the decoded response body as text."""
        print(url)
        response = requests.get(url, headers=self.headers)
        # Fail loudly on HTTP errors instead of silently saving an error page.
        response.raise_for_status()
        return response.content.decode()

    def save_html_str(self, html_str, page_num):
        """Write one page of HTML to '<tieba_name>第<page_num>页.html'."""
        path_name = "{}第{}页.html".format(self.tieba_name, page_num)
        with open(path_name, "w", encoding="utf8") as f:
            f.write(html_str)

    def run(self):
        # 1. Build the list of page URLs.
        url_list = self.get_url_list()
        # 2. Fetch each URL. enumerate() replaces the original
        #    url_list.index(url), which was an O(n) lookup per iteration
        #    and returns the wrong index if a URL ever appears twice.
        for page_num, url in enumerate(url_list, start=1):
            html_str = self.parse_url(url)
            # 3. Save the response to disk.
            self.save_html_str(html_str, page_num)
        print("save_done")
if __name__ == "__main__":
    # Crawl the "李毅" tieba and persist its first result page locally.
    spider = TiebaSpider("李毅")
    spider.run()

https://tieba.baidu.com/f?kw=李毅&pn=0
save_done

列表推导式的过滤 if 后面不能写 else（若需要 else，应使用放在 for 之前的条件表达式 a if 条件 else b）
In [1]: ["a" for i in range(3) if i%2==0]
Out[1]: ['a', 'a']

In [2]: ["a" for i in range(3) if i%2==0 else sa]
File "<ipython-input-2-cf2fd1fb6f20>", line 1
["a" for i in range(3) if i%2==0 else sa]
^
SyntaxError: invalid syntax

 

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: