
A single-threaded Python crawler that scrapes meme images (表情包), suitable for beginners

2018-01-18 12:13
import requests
from bs4 import BeautifulSoup
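# Dependencies: pip install requests beautifulsoup4 lxml  (lxml is used as the BeautifulSoup parser below)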
def biaoqingbao(beginPage, endPage):
    """Crawl the doutula list pages from beginPage to endPage and download every meme image found."""
    url = "https://www.doutula.com/article/list/?page="
    sess = requests.Session()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36',
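        # Session cookie, presumably copied from the author's browser; replace it with a fresh one if requests start failing.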
        'cookie' : '__cfduid=d24b0865c62cf58b642f4f35ffec61ce21515548138; UM_distinctid=160ddb55d7b280-09e1869e5aa032-35447354-1fa400-160ddb55d7c290; XSRF-TOKEN=eyJpdiI6IjZvSWZSVFFyeW50Smdtc2t0ZDFjc3c9PSIsInZhbHVlIjoiUnNGdU5EdzhWQ1JPVFRSalU5aWpzdFdleWRcL3FSWTdwaEtZXC84U3RxcjZXeTV4MWJmNVRaSVZYdHQxWDIxdHpuRnpMRjJBbUJjR2NIN1U3cnNOS1k1Zz09IiwibWFjIjoiOTg2ZjVkNTdkMmM5YjMyZDNiNjRmNzAxMjJiZTc5OGNiZTVlOGMxNmMzMmY0MWY3NzU4MjMzZTM1OWRjZjkxMiJ9; laravel_session=eyJpdiI6ImxrUERWQ0NpSHFxRDNSc2VsaFd0dWc9PSIsInZhbHVlIjoicG9COGJOU1wvT1JqaENJckY1a0xcL2pXVWxIaG5KMEY1SGJuRGlFd1wvWnkwdEZLRXl5QUNxbXpYZEVQN3N6XC96UCt1ZGFlOEEreHk1WWtRV3E1UXBscjVRPT0iLCJtYWMiOiI2OTQ3MWY1MDQ4MjNkZGZmNmJiNTU4OWZlNDkxYTUzNTIzNjg0MzBhZDg1YzE1MWFkOGM2MjkwNmVhOWQ1ZmU0In0%3D; _ga=GA1.2.1818345319.1515548139; _gid=GA1.2.101140113.1515548139; yjs_id=aHR0cHM6Ly93d3cuZG91dHVsYS5jb20vYXJ0aWNsZS9saXN0Lz9wYWdlPTF8MTUxNTU1Njg0MTU2MQ; CNZZDATA1256911977=2123601880-1515544387-%7C1515553089',
        # 'referer': 'https: // www.doutula.com / article / list /?page = 1'
    }
    for page in range(beginPage, endPage + 1):
        finalUrl = url + str(page)
        # Fetch one list page and parse it with BeautifulSoup.
        html = sess.get(finalUrl, headers=headers, timeout=500).content.decode()
        bs = BeautifulSoup(html, "lxml")
        # The images are lazy-loaded, so the real URL is stored in the data-original attribute.
        img = bs.find_all(attrs={"class": "lazy image_dtb img-responsive"})
        # print(html)
        img_urls = []
        for src in img:
            img_src = src.get('data-original')
            if img_src and 'http' in img_src:
                print(img_src)
                img_urls.append(img_src)
                writeImg(img_src, headers)
        print("Crawl finished: " + str(len(img_urls)) + " images")

        # print(img_urls)
def writeImg(link, headers):
    """Download a single image and save it to the current directory."""
    sess = requests.Session()
    data = sess.get(link, headers=headers).content
    # Use the last 12 characters of the URL as the file name.
    filename = link[-12:]
    with open('bqb' + filename, "wb") as f:
        f.write(data)

if __name__ == "__main__":
    beginPage = int(input("Enter the start page: "))
    endPage = int(input("Enter the end page: "))
    biaoqingbao(beginPage, endPage)
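
If you prefer not to type the page range interactively, the crawl function can also be called from another script. A minimal sketch, assuming the code above is saved as crawler.py (the file name and the page range below are only examples):

# run_crawler.py: example driver for the crawler above (assumed to be saved as crawler.py)
from crawler import biaoqingbao

# Crawl list pages 1 through 3; the __main__ guard keeps the input prompts from running on import.
biaoqingbao(1, 3)

Because everything runs in a single thread, pages and images are downloaded one after another, which keeps the code easy to follow for beginners at the cost of speed.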
Tags: python