用python写一个爬虫——爬取性感小姐姐
2018-03-08 08:39
519 查看
忍着鼻血写代码
废话不多说 直接上代码
有时间再补充备注
网站地址:http://www.meizitu.com/a/more_1.html
from bs4 import BeautifulSoup
import random,os,requests
# Shared HTTP headers for every request: a spoofed desktop Firefox
# User-Agent plus a Referer pointing at the image host — presumably the
# site rejects image requests without it (hotlink protection); verify.
headers = {
'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:58.0) Gecko/20100101",
'Referer': "http://i.meizitu.net"
}
def home_page(num, num2, headers):
    """Collect article-page links from listing pages ``num``..``num2`` (inclusive).

    Each listing page ``http://www.meizitu.com/a/more_<page>.html`` contains
    elements with ``class="pic"`` whose first ``<a>`` links to an article page.

    :param num: first listing-page number to fetch
    :param num2: last listing-page number to fetch (inclusive)
    :param headers: HTTP headers passed to ``requests.get``
    :return: list of article-page URL strings
    """
    list_url = []
    # FIX: use a dedicated loop variable instead of shadowing the `num`
    # parameter, as the original `for num in range(num, ...)` did.
    for page in range(num, num2 + 1):
        url = "http://www.meizitu.com/a/more_%d.html" % page
        req = requests.get(url, headers=headers)
        # Let requests sniff the real encoding from the body rather than
        # trusting the (often missing/wrong) Content-Type charset.
        req.encoding = req.apparent_encoding
        bf = BeautifulSoup(req.text, 'lxml')
        for each in bf.find_all(class_="pic"):
            list_url.append(each.a.get('href'))
    return list_url
def deal_page(headers, list_url):
    """Fetch each article page and gather its ``id="picture"`` element(s).

    :param headers: HTTP headers passed to ``requests.get``
    :param list_url: article-page URLs produced by ``home_page``
    :return: one entry per URL; each entry is the ResultSet returned by
        ``find_all(id="picture")`` for that page (a list of Tag objects).
    """
    collected = []
    for article_url in list_url:
        resp = requests.get(article_url, headers=headers)
        # Article pages are decoded as UTF-8.
        resp.encoding = "utf-8"
        soup = BeautifulSoup(resp.text, 'lxml')
        collected.append(soup.find_all(id="picture"))
    return collected
def download(headers, list_url2):
    """Extract ``.jpg`` URLs from the scraped ``#picture`` elements and save them.

    The nested Tag lists are stringified and scanned with a non-greedy
    regex for ``http...jpg`` substrings, which captures the ``<img src>``
    values embedded in the HTML.

    :param headers: HTTP headers passed to ``requests.get``
    :param list_url2: nested lists of Tag objects from ``deal_page``
    """
    import re
    # Non-greedy match pulls each individual jpg URL out of the blob.
    urls = re.findall(r'http.*?jpg', str(list_url2))
    print(urls, len(urls))
    root = "//Users//apple//Desktop//meizitu//"
    # FIX: create the target directory up front instead of crashing with
    # FileNotFoundError on the first open() when it does not exist.
    os.makedirs(root, exist_ok=True)
    for endurl in urls:
        # Join the last three path components so images from different
        # albums do not collide on the bare file name alone.
        parts = endurl.split('/')
        filename = parts[-3] + parts[-2] + parts[-1]
        print(endurl)
        print(filename)
        req3 = requests.get(endurl, headers=headers)
        # NOTE(review): the random prefix makes the exists-check almost
        # always miss, so re-runs re-download everything; kept as-is to
        # preserve the original naming scheme.
        path = root + str(random.randrange(10000)) + filename
        if not os.path.exists(path):
            # `with` closes the file; the original's explicit f.close()
            # inside the with-block was redundant and has been removed,
            # as has the unused `list_url3` and the full debug dump of
            # `list_url2`.
            with open(path, 'wb') as f:
                f.write(req3.content)
    print("下载完成")
if __name__ == '__main__':
    # Interactive entry point: ask for an inclusive listing-page range,
    # then pipeline: listing pages -> article pages -> image downloads.
    start_page = int(input("请输入要爬取的起始页:"))
    end_page = int(input("请输入终止页:"))
    article_links = home_page(start_page, end_page, headers)
    picture_sets = deal_page(headers, article_links)
    download(headers, picture_sets)
废话不多说 直接上代码
有时间再补充备注
网站地址:http://www.meizitu.com/a/more_1.html
from bs4 import BeautifulSoup
import random,os,requests
# Shared HTTP headers for every request: a spoofed desktop Firefox
# User-Agent plus a Referer pointing at the image host — presumably the
# site rejects image requests without it (hotlink protection); verify.
headers = {
'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:58.0) Gecko/20100101",
'Referer': "http://i.meizitu.net"
}
def home_page(num, num2, headers):
    """Collect article-page links from listing pages ``num``..``num2`` (inclusive).

    Each listing page ``http://www.meizitu.com/a/more_<page>.html`` contains
    elements with ``class="pic"`` whose first ``<a>`` links to an article page.

    :param num: first listing-page number to fetch
    :param num2: last listing-page number to fetch (inclusive)
    :param headers: HTTP headers passed to ``requests.get``
    :return: list of article-page URL strings
    """
    list_url = []
    # FIX: use a dedicated loop variable instead of shadowing the `num`
    # parameter, as the original `for num in range(num, ...)` did.
    for page in range(num, num2 + 1):
        url = "http://www.meizitu.com/a/more_%d.html" % page
        req = requests.get(url, headers=headers)
        # Let requests sniff the real encoding from the body rather than
        # trusting the (often missing/wrong) Content-Type charset.
        req.encoding = req.apparent_encoding
        bf = BeautifulSoup(req.text, 'lxml')
        for each in bf.find_all(class_="pic"):
            list_url.append(each.a.get('href'))
    return list_url
def deal_page(headers, list_url):
    """Fetch each article page and gather its ``id="picture"`` element(s).

    :param headers: HTTP headers passed to ``requests.get``
    :param list_url: article-page URLs produced by ``home_page``
    :return: one entry per URL; each entry is the ResultSet returned by
        ``find_all(id="picture")`` for that page (a list of Tag objects).
    """
    collected = []
    for article_url in list_url:
        resp = requests.get(article_url, headers=headers)
        # Article pages are decoded as UTF-8.
        resp.encoding = "utf-8"
        soup = BeautifulSoup(resp.text, 'lxml')
        collected.append(soup.find_all(id="picture"))
    return collected
def download(headers, list_url2):
    """Extract ``.jpg`` URLs from the scraped ``#picture`` elements and save them.

    The nested Tag lists are stringified and scanned with a non-greedy
    regex for ``http...jpg`` substrings, which captures the ``<img src>``
    values embedded in the HTML.

    :param headers: HTTP headers passed to ``requests.get``
    :param list_url2: nested lists of Tag objects from ``deal_page``
    """
    import re
    # Non-greedy match pulls each individual jpg URL out of the blob.
    urls = re.findall(r'http.*?jpg', str(list_url2))
    print(urls, len(urls))
    root = "//Users//apple//Desktop//meizitu//"
    # FIX: create the target directory up front instead of crashing with
    # FileNotFoundError on the first open() when it does not exist.
    os.makedirs(root, exist_ok=True)
    for endurl in urls:
        # Join the last three path components so images from different
        # albums do not collide on the bare file name alone.
        parts = endurl.split('/')
        filename = parts[-3] + parts[-2] + parts[-1]
        print(endurl)
        print(filename)
        req3 = requests.get(endurl, headers=headers)
        # NOTE(review): the random prefix makes the exists-check almost
        # always miss, so re-runs re-download everything; kept as-is to
        # preserve the original naming scheme.
        path = root + str(random.randrange(10000)) + filename
        if not os.path.exists(path):
            # `with` closes the file; the original's explicit f.close()
            # inside the with-block was redundant and has been removed,
            # as has the unused `list_url3` and the full debug dump of
            # `list_url2`.
            with open(path, 'wb') as f:
                f.write(req3.content)
    print("下载完成")
if __name__ == '__main__':
    # Interactive entry point: ask for an inclusive listing-page range,
    # then pipeline: listing pages -> article pages -> image downloads.
    start_page = int(input("请输入要爬取的起始页:"))
    end_page = int(input("请输入终止页:"))
    article_links = home_page(start_page, end_page, headers)
    picture_sets = deal_page(headers, article_links)
    download(headers, picture_sets)
相关文章推荐
- [Python]网络爬虫(六):一个简单的百度贴吧的小爬虫
- python实现一个简单的爬虫
- 利用python写一个简易的爬虫,基于慕课网对应课程
- Cola:一个分布式爬虫框架 - 系统架构 - Python4cn(news, jobs)
- Python写的一个爬虫程序
- 一个基于python的数据爬虫
- Python 网络爬虫 009 (编程) 通过正则表达式来获取一个网页中的所有的URL链接,并下载这些URL链接的源代码
- [Python]一起来写一个Python爬虫工具类whyspider
- python一个关于贴吧的小爬虫(二)
- 一个Python小白5个小时爬虫经历
- 用python写一个简单的爬虫功能
- python学习之 12306的一个小爬虫
- 一个python单线程爬虫,爬取表情包,新手试用
- [python脚本]一个简单的web爬虫(1)
- 用python写的一个简单的爬取湖州天气的爬虫
- 一个用Python编写的股票数据(沪深)爬虫和选股策略测试框架
- python爬虫学习(8) —— 关于4399的一个小Demo
- Python编写一个简单的简单的爬虫-下载保存在本地
- 用Python3实现一个简单的爬虫。