您的位置：首页 > 编程语言 > Python开发
python抓取网站88titienmae88中的“图片区”所有图片

2018-01-12 11:09 459 查看
#-*-coding:utf-8-*-
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup
import re
import os

'''
抓取网站http://jyghf.com/中的“图片区”的所有图片
关于这个网站，大家不要太有纠结的情绪，作为一个泱泱大国的男士，第一个爬虫，必须要有作为，作为啊！！！
'''

'''
第一步，进入到图片类中：根据http://jyghf.com/的html分析：
在id='top_box'的div中，第一个class='menu'的div包含着所有的“图片区”分类。
这些分类的url都是以“/p”开头，如：/p01/index.html，全路径： http://jyghf.com/p01/index.html '''

'''
第二步，进入到图片文件夹中：根据http://jyghf.com/p01/index.html的html分析
在class="typelist"的div中，图片路径都在“<li>”标签中，这些图片路径都是以“/htm/”开头
如：“/htm/2017/12/13/p01/393067.html”，全路径：“http://jyghf.com/htm/2017/12/13/p01/393067.html”
'''

'''
第三步，获取图片的下载路径：根据html分析，图片路径，都在id="view1"的div中的<img>标签的“src”属性下。
'''

# Step 1: enter the picture categories.
def getPicTypeLink():
    """Collect the category links of the picture section from the home page.

    Fetches http://jyghf.com/ and looks inside the ``div`` with
    ``id="top_box"`` for its first ``div`` with ``class="menu"``. Every
    ``<a>`` in that menu whose ``href`` starts with ``/p`` is returned.

    Returns:
        A list of ``<a>`` tags; empty when the expected containers are
        not present in the page.
    """
    # The context manager closes the HTTP response even if parsing fails.
    with urlopen("http://jyghf.com/") as html:
        bshtml = BeautifulSoup(html, "html.parser")

    top_box = bshtml.find("div", {"id": "top_box"})
    menu = top_box.find("div", {"class": "menu"}) if top_box is not None else None
    if menu is None:
        # find() returns None for a missing element; report "no categories"
        # instead of failing with AttributeError on the chained lookup.
        return []
    return menu.find_all("a", href=re.compile("^(/p)"))

# Step 2: enter the picture folders of one category page.
def getPicFileLink(typeLink):
    """Collect the links to the picture folders listed on a category page.

    Args:
        typeLink: site-relative path of the category page, e.g.
            ``/p01/index.html``.

    Returns:
        A list of ``<a>`` tags found in the ``div`` with ``class="typelist"``
        whose ``href`` starts with ``/htm/``; empty when that ``div`` is
        missing.
    """
    # The href already starts with "/", so drop it to avoid building
    # "http://jyghf.com//p01/..." with a doubled slash.
    url = "http://jyghf.com/{0}".format(typeLink.lstrip("/"))
    # The context manager closes the HTTP response even if parsing fails.
    with urlopen(url) as html:
        bshtml = BeautifulSoup(html, "html.parser")

    typelist = bshtml.find("div", {"class": "typelist"})
    if typelist is None:
        # find() returns None for a missing element; treat it as an empty page.
        return []
    return typelist.find_all("a", href=re.compile("^(/htm/)"))

# Step 3: get the download URLs of the pictures in one folder page.
def getPicSrcLink(picfilelink):
    """Collect the ``<img>`` tags that carry the picture download URLs.

    Args:
        picfilelink: site-relative path of a picture folder page, e.g.
            ``/htm/2017/12/13/p01/393067.html``.

    Returns:
        A list of ``<img>`` tags inside the ``div`` with ``id="view1"`` whose
        ``src`` starts with ``http://``; empty when that ``div`` is missing.
    """
    # The href already starts with "/", so drop it to avoid a doubled slash.
    url = "http://jyghf.com/{0}".format(picfilelink.lstrip("/"))
    # The context manager closes the HTTP response even if parsing fails.
    with urlopen(url) as html:
        bshtml = BeautifulSoup(html, "html.parser")

    view = bshtml.find("div", {"id": "view1"})
    if view is None:
        # find() returns None for a missing element; treat it as "no pictures".
        return []
    return view.find_all("img", src=re.compile("^(http://)"))

# Helper: build (and create) the target directory from category, page and folder.
def getDownloadPath(typename, pageid, filename, downLoadDirectory=r"E:\downloaded"):
    """Return the download directory for one picture folder, creating it if needed.

    The directory is ``<downLoadDirectory>/<typename>/第<pageid>页/<filename>/``.

    Args:
        typename: name of the picture category.
        pageid: page number within the category.
        filename: name of the picture folder.
        downLoadDirectory: root directory for all downloads.

    Returns:
        The directory path as a string, ending with ``/``.
    """
    path = "{0}/{1}/第{2}页/{3}/".format(downLoadDirectory, typename, pageid, filename)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    return path

# Download pictures: fetch every picture referenced by a list of <img> tags.
def downloadPic(picSrcLinks, typename, pageid, filename):
    """Download each picture in ``picSrcLinks`` as ``1.jpg``, ``2.jpg``, ...

    Args:
        picSrcLinks: iterable of ``<img>`` tags whose ``src`` attribute is
            the download URL.
        typename: name of the picture category (used for the directory).
        pageid: page number within the category (used for the directory).
        filename: name of the picture folder (used for the directory).
    """
    target_dir = None
    for pid, picsrc in enumerate(picSrcLinks, start=1):
        if target_dir is None:
            # Resolve the directory once, and only when there is at least
            # one picture, so empty folders do not leave empty directories.
            target_dir = getDownloadPath(typename, pageid, filename)
        downloadurl = picsrc.attrs["src"]
        print("第{0}张图片".format(pid))
        # getDownloadPath() already ends with "/", so join instead of
        # formatting in another separator.
        urlretrieve(downloadurl, os.path.join(target_dir, "{0}.jpg".format(pid)))

# Main routine.
def _findNextPageLink(typeLink):
    """Return the href of the "下一页" (next page) link on a category page, or None."""
    url = "http://jyghf.com/{0}".format(typeLink.lstrip("/"))
    # The context manager closes the HTTP response even if parsing fails.
    with urlopen(url) as html:
        bshtml = BeautifulSoup(html, "html.parser")

    pager = bshtml.find("div", {"id": "page"})
    if pager is None:
        return None
    nextpage = pager.find("a", title="下一页")
    if nextpage is None:
        return None
    return nextpage.attrs.get("href")


def download(link):
    """Download every picture of one category, following its "next page" links.

    For each page of the category, every picture folder listed on it is
    opened and its pictures are saved under a directory named after the
    category, the page number and the folder.

    Args:
        link: the category's ``<a>`` tag; its ``href`` is the first page and
            its text is the category name.
    """
    typeLink = link.attrs["href"]
    typename = link.get_text()
    print(typename)

    pageid = 1
    # Pages are walked iteratively so the category name and the page counter
    # carry over from page to page. The visited set stops the walk if the
    # pager ever points back to a page that was already processed.
    visited = set()
    while typeLink and typeLink not in visited:
        visited.add(typeLink)
        for picfile in getPicFileLink(typeLink):
            picFileLink = picfile.attrs["href"]
            filename = picfile.get_text()
            print("第{0}页:{1}".format(pageid, filename))
            picSrcLinks = getPicSrcLink(picFileLink)
            downloadPic(picSrcLinks, typename, pageid, filename)

        # NOTE(review): assumes the next-page href is site-absolute like the
        # category hrefs ("/p01/...") - confirm against the live markup.
        typeLink = _findNextPageLink(typeLink)
        pageid += 1

# ---------------- start ----------------
# Script driver: fetch the category links from the home page, then download
# each category in turn.
picTypeLinks = getPicTypeLink()
for pictypelink in picTypeLinks:
    download(pictypelink)
内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理
标签： python爬虫爬取图片爬取网站图片
相关文章推荐
新的分享
章节导航