您的位置:首页 > 编程语言 > Python开发

Python3抓取煎蛋妹子图

2015-07-24 11:03 645 查看
闲来无事,从煎蛋妹子图抓取所有图片并保存。ps:900页之前图片都看不到了,所以只能抓900页之后的

#coding=utf-8

import httplib2,re,random,os
# Page URLs queued for crawling; filled by createJDUrlList.
urls=[]
# Image URLs collected from the crawled pages; filled by getImgUrlsFromPageUrl.
imgUrls=[]
# Shared httplib2 client used for every request; ".cache" is its cache argument.
h = httplib2.Http(".cache")
# Captures the src URL (from "http" through "gif") of an <img> tag that is
# immediately followed by a closing </p>.
reg=r'img src="(http.*?gif)" /></p>'
imgre = re.compile(reg)
# Directory that downloaded images and the saved URL list are written into.
filepath='/home/arthur/myapps/imgs'
# A page number is appended to this prefix to form a gallery page URL.
prefix_url='http://jandan.net/ooxx/page-'

def createJDUrlList(size):
    """Queue gallery page URLs for pages 900 up to (but excluding) ``size``.

    The URLs are appended to the module-level ``urls`` list; nothing is
    returned.
    """
    # The first 900 pages of the jandan gallery have no content.
    first_page = 900
    urls.extend(
        prefix_url + str(page_no) for page_no in range(first_page, size)
    )

def getImgUrlsFromPageUrl(pageUrl):
    """Fetch one gallery page and collect the image URLs found in it.

    The page is requested through the shared ``h`` client, decoded as UTF-8
    and scanned with the module-level ``imgre`` pattern. Matches are appended
    to the module-level ``imgUrls`` list and their count is printed.

    Args:
        pageUrl: URL of the gallery page to scan.
    """
    # A random number is embedded in the User-Agent so each request looks new.
    user_agent = (
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/"
        + str(random.random())
        + " Firefox/39.0"
    )
    request_headers = {"User-Agent": user_agent, "Accept": "text/plain"}
    _, raw_body = h.request(pageUrl, headers=request_headers)
    found = imgre.findall(raw_body.decode('utf-8'))
    imgUrls.extend(found)
    print('pageImgUrls:' + str(len(found)))

# Save an image to disk; skip it when the target file already exists.
def getAndSaveImg(imgUrl):
    """Download ``imgUrl`` into the ``filepath`` directory.

    The file name is the last path segment of the URL. If a file with that
    name already exists the download is skipped. Download errors are printed
    and swallowed so that one bad image does not abort the whole run.

    Args:
        imgUrl: Absolute URL of the image to download.
    """
    filename = imgUrl.split('/')[-1]
    file_path = filepath + '/' + filename
    if os.path.exists(file_path):
        return
    print('save:[' + imgUrl + '] to [' + file_path + ']')
    try:
        # Testing showed jandan blocks crawlers, so the User-Agent is varied
        # on every call to make each request look like a new client.
        i_headers = {
            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/"
                          + str(random.random()) + " Firefox/39.0",
            "Accept": "text/plain",
        }
        response, content = h.request(imgUrl, headers=i_headers)
        # Do not store an error page under the image name: the exists-check
        # above would then skip this URL on every later run.
        if response.status != 200:
            print('skip:[' + imgUrl + '] status ' + str(response.status))
            return
        save2File(fcontent=content, mode='wb', path=file_path)
    except Exception as e:
        # Deliberate best-effort: report the failure and move on.
        print(e)

def save2File(fcontent,path,mode):
    """Write ``fcontent`` to the file at ``path`` opened with ``mode``.

    Args:
        fcontent: Data to write; ``str`` for text modes such as ``'w'``,
            ``bytes`` for binary modes such as ``'wb'``.
        path: Destination file path.
        mode: Mode string passed to ``open``.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # ``with`` closes the handle even when the write raises.
    with open(path, mode) as wfile:
        wfile.write(fcontent)

def getJiandanMM():
    """Crawl the gallery pages, save the image URL list, then download images.

    Steps:
      1. Queue the page URLs via ``createJDUrlList``.
      2. Scan every queued page once for image URLs.
      3. Write the collected URLs to ``allurls_2.txt`` in the ``filepath``
         directory so a failed download run can be resumed from that file.
      4. Download every collected image.
    """
    # Latest page number; createJDUrlList treats it as an exclusive bound.
    createJDUrlList(1474)
    # Each page is scanned exactly once, so no page's images are listed twice.
    for url in urls:
        getImgUrlsFromPageUrl(url)

    # Stash the image URLs in a file; if downloading fails they can be
    # reloaded from it. Format: one line of quoted, comma-separated URLs,
    # which is what getJiandanMMFromFile parses.
    pre = ', '.join(repr(img_url) for img_url in imgUrls) + "\n"
    save2File(fcontent=pre, mode='w', path=filepath + '/allurls_2.txt')

    for imgUrl in imgUrls:
        print(imgUrl)
        getAndSaveImg(imgUrl)

def getJiandanMMFromFile(filepath):
    """Download the images listed in a previously saved URL file.

    Only the first line of the file is read. It is expected to hold
    comma-separated URLs, optionally wrapped in single quotes and separated
    by spaces (the format written by ``getJiandanMM``).

    Args:
        filepath: Path of the URL list file. This parameter shadows the
            module-level ``filepath`` only inside this function; the download
            directory used by ``getAndSaveImg`` is unaffected.

    Raises:
        OSError: If the URL file cannot be opened.
    """
    with open(filepath, 'r') as url_file:
        line = url_file.readline()
    cleaned = line.replace('\n', '').replace('\'', '').replace(' ', '')

    for url in cleaned.split(','):
        # An empty file or a trailing comma yields '' entries; skip them.
        if url:
            getAndSaveImg(url)
# Script entry point: run the crawl-and-download routine when executed directly.
if __name__ == "__main__":
    getJiandanMM()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: