您的位置:首页 > 编程语言 > Python开发

用 Python 写爬虫,爬取清纯妹子网站

2017-07-14 18:50 267 查看
转载:https://zhuanlan.zhihu.com/p/26395979

# encoding: utf-8
import requests
from lxml import html

def get_page_number(num):
    """Return the link targets found on listing page ``num`` of mmjpg.com.

    Args:
        num: Page number appended to ``http://www.mmjpg.com/home/``. May be a
            string or an integer.

    Returns:
        A list of every ``href`` value matched by ``//ul/li/a/@href`` on the
        fetched page.
    """
    # format() accepts both "2" and 2; plain string concatenation raised
    # TypeError whenever the caller passed an integer page number.
    url = "http://www.mmjpg.com/home/{}".format(num)
    response = requests.get(url).content
    selector = html.fromstring(response)
    return list(selector.xpath("//ul/li/a/@href"))

def get_image_title(url):
    """Fetch ``url`` and return the first text node found under an ``<h2>``.

    Raises:
        IndexError: if ``//h2/text()`` matches nothing on the page.
    """
    page = html.fromstring(requests.get(url).content)
    heading_texts = page.xpath("//h2/text()")
    return heading_texts[0]

def get_image_amount(url):
    """Fetch ``url`` and return the label of the second-to-last pager link.

    The label is the text of ``//div[@class='page']/a[last()-1]`` and is
    returned as found in the page (it is not converted to an integer).
    Presumably this is the number of images in the album -- verify against
    the site's markup.

    Raises:
        IndexError: if the XPath expression matches nothing.
    """
    document = html.fromstring(requests.get(url).content)
    pager_labels = document.xpath("//div[@class='page']/a[last()-1]/text()")
    return pager_labels[0]

def get_image_detail_website(url):
    """Collect the image ``src`` URL from every numbered page under ``url``.

    The page count is read from the second-to-last pager link on ``url``;
    pages ``url/1`` .. ``url/<count>`` are then fetched one by one and the
    first ``//div[@class='content']/a/img/@src`` value of each is collected.

    Returns:
        A list of image URLs, one per page, in page order.

    Raises:
        IndexError: if the pager link or an image element is not found.
        ValueError: if the pager label is not an integer.
    """
    album_page = html.fromstring(requests.get(url).content)
    pager_label = album_page.xpath(
        "//div[@class='page']/a[last()-1]/text()"
    )[0]
    page_count = int(pager_label)

    image_links = []
    # Page numbers in the URL are 1-based.
    for page_no in range(1, page_count + 1):
        page_url = '{}/{}'.format(url, page_no)
        page_doc = html.fromstring(requests.get(page_url).content)
        image_src = page_doc.xpath("//div[@class='content']/a/img/@src")[0]
        image_links.append(image_src)

    return image_links

def download_image(image_title, image_detail_websites):
    """Download each image URL and save it as ``<title><n>.jpg``.

    Files are written relative to the current working directory and are
    numbered from 1 in the order of ``image_detail_websites``.

    Args:
        image_title: Album title, used as the file-name prefix.
        image_detail_websites: Sequence of image URLs to download.
    """
    # The title is scraped from a web page, so replace characters that act as
    # path separators or are rejected in file names on common platforms;
    # otherwise open() fails (or writes somewhere unexpected) for such titles.
    safe_title = ''.join(
        '_' if ch in '\\/:*?"<>|' else ch for ch in image_title
    )
    amount = len(image_detail_websites)
    for num, image_url in enumerate(image_detail_websites, 1):
        filename = '%s%s.jpg' % (safe_title, num)
        # Progress output: title, current index, total count.
        print(image_title, num, amount)
        # Fetch before opening the file so that a failed request does not
        # leave an empty file behind.
        data = requests.get(image_url).content
        with open(filename, 'wb') as f:
            f.write(data)

# Script driver: crawl listing page "2", print each album URL, then download
# every image of that album. The page number is hard-coded; an interactive
# prompt for it was previously sketched here but is not enabled.
# NOTE: this loop runs at module level, i.e. also when the file is imported.
for link in get_page_number("2"):
    # print(link) with a single argument behaves the same on Python 2 and 3,
    # unlike the statement form `print link`, which is a syntax error on 3.
    print(link)
    download_image(get_image_title(link), get_image_detail_website(link))
内容来自用户分享和网络整理,不保证内容的准确性;如有侵权内容,可联系管理员处理。点击这里给我发消息
标签: