一个python单线程爬虫,爬取表情包,新手使用
2018-01-18 12:13
507 查看
import requests
from bs4 import BeautifulSoup
def biaoqingbao(beginPage, endPage):
    """Crawl meme list pages from doutula.com and download every image found.

    Args:
        beginPage: first page number to fetch (inclusive).
        endPage: last page number to fetch (inclusive).
    """
    url = "https://www.doutula.com/article/list/?page="
    sess = requests.Session()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36',
        'cookie' : '__cfduid=d24b0865c62cf58b642f4f35ffec61ce21515548138; UM_distinctid=160ddb55d7b280-09e1869e5aa032-35447354-1fa400-160ddb55d7c290; XSRF-TOKEN=eyJpdiI6IjZvSWZSVFFyeW50Smdtc2t0ZDFjc3c9PSIsInZhbHVlIjoiUnNGdU5EdzhWQ1JPVFRSalU5aWpzdFdleWRcL3FSWTdwaEtZXC84U3RxcjZXeTV4MWJmNVRaSVZYdHQxWDIxdHpuRnpMRjJBbUJjR2NIN1U3cnNOS1k1Zz09IiwibWFjIjoiOTg2ZjVkNTdkMmM5YjMyZDNiNjRmNzAxMjJiZTc5OGNiZTVlOGMxNmMzMmY0MWY3NzU4MjMzZTM1OWRjZjkxMiJ9; laravel_session=eyJpdiI6ImxrUERWQ0NpSHFxRDNSc2VsaFd0dWc9PSIsInZhbHVlIjoicG9COGJOU1wvT1JqaENJckY1a0xcL2pXVWxIaG5KMEY1SGJuRGlFd1wvWnkwdEZLRXl5QUNxbXpYZEVQN3N6XC96UCt1ZGFlOEEreHk1WWtRV3E1UXBscjVRPT0iLCJtYWMiOiI2OTQ3MWY1MDQ4MjNkZGZmNmJiNTU4OWZlNDkxYTUzNTIzNjg0MzBhZDg1YzE1MWFkOGM2MjkwNmVhOWQ1ZmU0In0%3D; _ga=GA1.2.1818345319.1515548139; _gid=GA1.2.101140113.1515548139; yjs_id=aHR0cHM6Ly93d3cuZG91dHVsYS5jb20vYXJ0aWNsZS9saXN0Lz9wYWdlPTF8MTUxNTU1Njg0MTU2MQ; CNZZDATA1256911977=2123601880-1515544387-%7C1515553089',
        # 'referer': 'https: // www.doutula.com / article / list /?page = 1'
    }
    for page in range(beginPage, endPage + 1):
        finalUrl = url + str(page)
        html = sess.get(finalUrl, headers=headers, timeout=500).content.decode()
        bs = BeautifulSoup(html, "lxml")
        img = bs.find_all(attrs={"class": "lazy image_dtb img-responsive"})
        # Collect this page's image URLs.  Renamed from `list` — the original
        # shadowed the builtin; also dropped the unused `filename` local.
        img_urls = []
        for src in img:
            img_src = src.get('data-original')
            # get() returns None when the attribute is missing — guard before
            # calling .find(), which would otherwise raise AttributeError.
            if img_src and img_src.find('http') != -1:
                print(img_src)
                img_urls.append(img_src)
                writeImg(img_src, headers)
        print('爬取完成' + str(len(img_urls)) + "条")
def writeImg(link, headers):
    """Download one image and save it in the current directory as 'bqb<name>'.

    Args:
        link: absolute URL of the image.
        headers: HTTP headers (User-Agent/cookie) sent with the request.
    """
    sess = requests.Session()
    # Bounded timeout so one stalled download cannot hang the whole crawl.
    data = sess.get(link, headers=headers, timeout=30).content
    # Use the URL's basename.  The original `link[-12:]` could include a '/'
    # for short URLs, which makes open() fail with a bogus path.
    filename = link.rsplit('/', 1)[-1]
    with open('bqb' + filename, "wb") as f:
        f.write(data)
if __name__ == "__main__":
    # Prompt for the inclusive page range, then run the crawler over it.
    start_page = int(input("请输入起始页"))
    end_page = int(input("请输入结束页"))
    biaoqingbao(start_page, end_page)
from bs4 import BeautifulSoup
def biaoqingbao(beginPage, endPage):
    """Fetch doutula.com list pages beginPage..endPage and download each meme image.

    Args:
        beginPage: first page number (inclusive).
        endPage: last page number (inclusive).
    """
    url = "https://www.doutula.com/article/list/?page="
    sess = requests.Session()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36',
        'cookie' : '__cfduid=d24b0865c62cf58b642f4f35ffec61ce21515548138; UM_distinctid=160ddb55d7b280-09e1869e5aa032-35447354-1fa400-160ddb55d7c290; XSRF-TOKEN=eyJpdiI6IjZvSWZSVFFyeW50Smdtc2t0ZDFjc3c9PSIsInZhbHVlIjoiUnNGdU5EdzhWQ1JPVFRSalU5aWpzdFdleWRcL3FSWTdwaEtZXC84U3RxcjZXeTV4MWJmNVRaSVZYdHQxWDIxdHpuRnpMRjJBbUJjR2NIN1U3cnNOS1k1Zz09IiwibWFjIjoiOTg2ZjVkNTdkMmM5YjMyZDNiNjRmNzAxMjJiZTc5OGNiZTVlOGMxNmMzMmY0MWY3NzU4MjMzZTM1OWRjZjkxMiJ9; laravel_session=eyJpdiI6ImxrUERWQ0NpSHFxRDNSc2VsaFd0dWc9PSIsInZhbHVlIjoicG9COGJOU1wvT1JqaENJckY1a0xcL2pXVWxIaG5KMEY1SGJuRGlFd1wvWnkwdEZLRXl5QUNxbXpYZEVQN3N6XC96UCt1ZGFlOEEreHk1WWtRV3E1UXBscjVRPT0iLCJtYWMiOiI2OTQ3MWY1MDQ4MjNkZGZmNmJiNTU4OWZlNDkxYTUzNTIzNjg0MzBhZDg1YzE1MWFkOGM2MjkwNmVhOWQ1ZmU0In0%3D; _ga=GA1.2.1818345319.1515548139; _gid=GA1.2.101140113.1515548139; yjs_id=aHR0cHM6Ly93d3cuZG91dHVsYS5jb20vYXJ0aWNsZS9saXN0Lz9wYWdlPTF8MTUxNTU1Njg0MTU2MQ; CNZZDATA1256911977=2123601880-1515544387-%7C1515553089',
        # 'referer': 'https: // www.doutula.com / article / list /?page = 1'
    }
    for page in range(beginPage, endPage + 1):
        page_url = url + str(page)
        page_html = sess.get(page_url, headers=headers, timeout=500).content.decode()
        soup = BeautifulSoup(page_html, "lxml")
        tags = soup.find_all(attrs={"class": "lazy image_dtb img-responsive"})
        # Accumulate downloaded URLs for this page.  The original used the
        # name `list`, shadowing the builtin, and left `filename` unused.
        found = []
        for tag in tags:
            img_src = tag.get('data-original')
            # Missing attribute -> None: check it before .find() to avoid
            # an AttributeError crash mid-crawl.
            if img_src and img_src.find('http') != -1:
                print(img_src)
                found.append(img_src)
                writeImg(img_src, headers)
        print('爬取完成' + str(len(found)) + "条")
def writeImg(link, headers):
    """Download a single image from `link` and write it to 'bqb<basename>'.

    Args:
        link: absolute image URL.
        headers: HTTP headers (User-Agent/cookie) for the request.
    """
    sess = requests.Session()
    # Add a timeout: the original could block forever on a dead connection.
    content = sess.get(link, headers=headers, timeout=30).content
    # Derive the name from the URL path segment rather than the last 12
    # characters, which may straddle a '/' and yield an invalid file path.
    name = link.rsplit('/', 1)[-1]
    with open('bqb' + name, "wb") as f:
        f.write(content)
if __name__ == "__main__":
    # Read the first and last page numbers, then kick off the crawl.
    first = int(input("请输入起始页"))
    last = int(input("请输入结束页"))
    biaoqingbao(first, last)
相关文章推荐
- 一个单线程爬取英文维基百科正文与链接关系的Python爬虫
- 一个简单的python爬虫,以豆瓣妹子“http://www.dbmeizi.com/category/2?p= ”为例
- 用Python写一个小小的爬虫程序
- [Python]网络爬虫(十):一个爬虫的诞生全过程(以山东大学绩点运算为例)
- python爬虫爬取糗百成人图片单线程版本
- 【python新手入门】一个python List 的简单运用 -----班级学生管理系统
- Python编写一个简单的简单的爬虫-下载保存在本地
- 一个新手学习python、pys60的感受
- 基于python的一个大规模爬虫遇到的一些问题总结
- 【python学习笔记】14:开发一个简易的爬虫
- [Python]网络爬虫(六):一个简单的百度贴吧的小爬虫 Python 3.6 改写
- Python爬虫的一个编码问题
- 一个简单的多线程Python爬虫(一)
- [Python]网络爬虫(十):一个爬虫的诞生全过程(以山东大学绩点运算为例)
- 菜鸟成长记-----用python写一个简单的小爬虫
- 一个简单的不用cookie的人人网状态爬取的python爬虫,使用beautifulsoup
- [Python]网络爬虫(六):一个简单的百度贴吧的小爬虫
- 一个Python小白5个小时爬虫经历 【续】