您的位置:首页 > 编程语言 > Python开发

Python爬虫实战-爬取美女图片并保存至本地文件夹(XPath)

2019-02-23 16:09 465 查看

Python爬虫实战-爬取美女图片并保存至本地文件夹(XPath)

import urllib.parse
import urllib.request
from lxml import etree
import time
import os

# First page:  http://sc.chinaz.com/tupian/xingganmeinvtupian.html
# Later pages: http://sc.chinaz.com/tupian/xingganmeinvtupian_3.html

def handle_request(url, page):
    """Build the request object for one listing page.

    The first page has no numeric suffix while every later page does, so the
    suffix has to be chosen per page:

        page 1: http://sc.chinaz.com/tupian/xingganmeinvtupian.html
        page 3: http://sc.chinaz.com/tupian/xingganmeinvtupian_3.html

    Args:
        url: URL template containing one ``{}`` placeholder for the suffix.
        page: Page number; ``1`` selects the un-suffixed first page.

    Returns:
        A ``urllib.request.Request`` for the page, carrying a browser-like
        ``User-Agent`` header.
    """
    suffix = '' if page == 1 else '_' + str(page)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    }
    return urllib.request.Request(url=url.format(suffix), headers=headers)

def downoad_image(image_src):
    """Download one image into the ``xingganmeinv`` directory.

    The saved file is named after the last path component of ``image_src``.
    The directory is created if it does not exist yet. Errors raised while
    opening the URL or writing the file are not caught here and propagate to
    the caller.

    Args:
        image_src: URL of the image to fetch.
    """
    dirpath = 'xingganmeinv'
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory in between.
    os.makedirs(dirpath, exist_ok=True)

    # Derive the local file name from the URL.
    filename = os.path.basename(image_src)
    filepath = os.path.join(dirpath, filename)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    }
    request = urllib.request.Request(url=image_src, headers=headers)

    # The context manager closes the response (and its socket) even when
    # reading fails; the original left it open.
    with urllib.request.urlopen(request) as response:
        data = response.read()

    with open(filepath, 'wb') as fp:
        fp.write(data)

def parse_content(content):
    """Parse one listing page and download every image found on it.

    Each image is saved as ``meinv/<alt text><extension>``, where the
    extension is taken from the image URL. ``downoad_image`` is an
    alternative way to save the file (named after the URL instead of the
    title); this function uses ``urllib.request.urlretrieve`` directly.

    Args:
        content: HTML text of a listing page.
    """
    tree = etree.HTML(content)
    img_list = tree.xpath('//div[@id="container"]/div/div/a/img')

    dirname = 'meinv'
    # Created once up front instead of being re-checked for every image.
    os.makedirs(dirname, exist_ok=True)

    for lt in img_list:
        # The page lazy-loads its images, so the URL is read from ``src2``.
        sources = lt.xpath('.//@src2')
        titles = lt.xpath('.//@alt')
        if not sources or not titles:
            # One <img> without the expected attributes should not abort the
            # whole page with an IndexError.
            continue

        # NOTE(review): stripping "_s" before the extension presumably turns
        # the thumbnail URL into the full-size one -- confirm against the site.
        img_href = sources[0].replace('_s.', '.')
        img_title = titles[0]
        print('正在下载图片:%s  ......' % img_title)

        # File extension, taken from the image URL.
        s = os.path.splitext(img_href)[-1]
        # A path separator inside the title would point into a directory that
        # does not exist, so neutralise it before building the file name.
        safe_title = img_title.replace('/', '_').replace('\\', '_')
        filepath = os.path.join(dirname, safe_title + s)
        urllib.request.urlretrieve(img_href, filepath)

def main():
    """Prompt for a page range and download the images of every page in it.

    Both page numbers are read from standard input and the range is
    inclusive. Non-numeric input raises ``ValueError`` from ``int()``.
    """
    url = 'http://sc.chinaz.com/tupian/xingganmeinvtupian{}.html'
    start_page = int(input('请输入起始页码:'))
    end_page = int(input('请输入结束页码:'))
    for page in range(start_page, end_page + 1):
        print('开始下载第%s页所有图片,请稍等......' % page)
        request = handle_request(url, page)
        # Close each page response as soon as it has been read; the original
        # never closed it.
        with urllib.request.urlopen(request) as response:
            content = response.read().decode()
        parse_content(content)

# Start the scraper only when this file is run as a script, not on import.
if __name__ == '__main__':
    main()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: