您的位置:首页 > 编程语言 > Python开发

Python3.4网页爬虫,提取图片

2015-07-30 16:24 281 查看
网页图片爬虫:

第一个爬虫抓取bing主页图片,共24张

第二个抓取贴吧图片

第三个抓取图虫图片

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# -*- author:miko-*-
# python3抓取bing主页所有背景图片
import urllib.request
import urllib,re,sys,os
def get_bing_backphoto():
    """Download the Bing home-page background images into ``img/``.

    Queries the Bing ``HPImageArchive.aspx`` endpoint once for each archive
    index 0-23, extracts every ``"url"`` value from the response text and
    saves the image under ``img/``, named after the last path segment of
    its URL.

    Exits the process with status -1 if a response body is the literal
    ``null``.
    """
    # exist_ok avoids the race between an existence check and mkdir.
    os.makedirs('img', exist_ok=True)
    # Compiled once: the pattern does not depend on the loop index.
    reg = re.compile('"url":"(.*?)","urlbase"', re.S)
    for i in range(0, 24):
        url = 'http://cn.bing.com/HPImageArchive.aspx?format=js&idx='+str(i)+'&n=1&nc=1361089515117&FORM=HYLH1'
        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(url) as response:
            html = response.read()
        # read() returns bytes, so the sentinel has to be bytes as well;
        # comparing against the str 'null' is always False on Python 3.
        if html == b'null':
            print( 'open & read bing error!')
            sys.exit(-1)
        html = html.decode('utf-8')
        # Sample URL shape:
        # http://s.cn.bing.net/az/hprichbg/rb/LongJi_ZH-CN8658435963_1366x768.jpg
        for imgurl in reg.findall(html):
            # File name is whatever follows the last '/' in the URL.
            name = imgurl.rsplit('/', 1)[-1]
            savepath = 'img/' + name
            print (imgurl)
            urllib.request.urlretrieve(imgurl, savepath)
# Script entry: run the Bing background download when this file is executed.
get_bing_backphoto()


#coding=utf-8
import urllib.request
import re
import urllib,re,sys,os
def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Prints an error message and exits the process with status -1 when the
    body is the literal ``null``.
    """
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    # read() returns bytes; comparing with the str 'null' could never match
    # on Python 3, so the sentinel is compared as bytes.
    if html == b'null':
        print( 'open & read bing error!')
        sys.exit(-1)
    return html.decode('utf-8')

def getImg(html):
    """Download every ``.jpg`` referenced by ``src="..." pic_ext`` in *html*.

    Images are saved as ``baidu/0.jpg``, ``baidu/1.jpg``, ... in the order
    they appear in *html*; each URL is printed after it has been saved.
    The ``baidu`` directory is created in the current working directory
    if it does not exist yet.
    """
    # exist_ok avoids the race between an existence check and mkdir.
    os.makedirs('baidu', exist_ok=True)
    # [^"] keeps the capture inside a single src attribute; with '.' a
    # non-jpg <img> earlier on the same line would be swallowed into the
    # captured "URL" together with everything up to the next .jpg.
    imgre = re.compile(r'src="([^"]+?\.jpg)" pic_ext')
    for x, imgurl in enumerate(imgre.findall(html)):
        urllib.request.urlretrieve(imgurl, 'baidu/%s.jpg' % x)
        print(imgurl)
# Sample image URL (presumably the kind of address getImg matches on this thread):
#http://imgsrc.baidu.com/forum/pic/item/16391f30e924b89915f86eb06f061d950b7bf677.jpg
# Fetch the Tieba thread page, then download the images it references.
html = getHtml("http://tieba.baidu.com/p/2460150866")
getImg(html)
#print (getImg(html))


#-*- encoding: utf-8 -*-
'''
Created on 2015-7-30
@author: Miko
'''

import urllib.request
import urllib,re,sys,os,time
import uuid
# Collect the second-level page URLs.
def findUrl2(html):
    """Return the distinct TuChong page URLs found in *html*.

    Two URL shapes are recognised: ``http://tuchong.com/<digits>/<digits>/``
    and ``http://<name>.tuchong.com/<digits>/`` where ``<name>`` does not end
    in ``photos``. Duplicates are dropped and the order of first appearance
    is preserved.
    """
    # Dots are escaped so they only match a literal '.', not any character
    # (the unescaped form also accepted e.g. 'http://abcXtuchong.com/1/').
    re1 = r'http://tuchong\.com/\d+/\d+/|http://\w+(?<!photos)\.tuchong\.com/\d+/'
    seen = set()
    url2lstfltr = []
    # Single pass with a seen-set: same result as set() followed by
    # sort(key=list.index), without the quadratic index lookups.
    for url in re.findall(re1, html):
        if url not in seen:
            seen.add(url)
            url2lstfltr.append(url)
    return url2lstfltr
# Fetch the HTML text of a page.
def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8."""
    # The context manager closes the response once the body has been read,
    # instead of leaving the connection to be reclaimed by the GC.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')
# Download the images of one page to local disk.
def download(html_page , pageNo, base_dir='D:\\TuChong'):
    """Save every ``photos.tuchong.com`` ``.jpg`` referenced in *html_page*.

    Files are written to ``<base_dir>/<year>-<month>-<day>/<pageNo>/`` under
    a random UUID-based name, with a one-second pause after each download.

    Args:
        html_page: HTML text to scan for image URLs.
        pageNo: Listing page number; used as the innermost folder name.
        base_dir: Root folder for downloads. Defaults to the original
            hard-coded ``D:\\TuChong``.

    Returns:
        The ``urlretrieve`` result of the last image downloaded, or ``None``
        when *html_page* contains no matching image URL (nothing is created
        on disk in that case).
    """
    # Each part is limited to characters that can appear inside a URL, and
    # the dots are escaped; the greedy '.+' form glued several URLs on the
    # same line into one bogus match.
    re2 = r'http://photos\.tuchong\.com/[^\s"\'<>]+?/f/[^\s"\'<>]+?\.jpg'
    imglist = re.findall(re2, html_page)
    print (imglist)
    download_img = None
    if not imglist:
        return download_img

    # Folder name is today's local date, without zero padding.
    x = time.localtime()
    foldername = '%d-%d-%d' % (x.tm_year, x.tm_mon, x.tm_mday)
    picpath = os.path.join(base_dir, foldername, str(pageNo))
    # Created once up front rather than re-checked for every image.
    os.makedirs(picpath, exist_ok=True)

    for imgurl in imglist:
        filename = str(uuid.uuid1())
        target = os.path.join(picpath, '%s.jpg' % filename)
        print ("The photos location is:"+target)
        # Save the image to the target path.
        download_img = urllib.request.urlretrieve(imgurl, target)
        # Pause between downloads (presumably to avoid hammering the server).
        time.sleep(1)
        print(imgurl)
    return download_img

# def callback(blocknum, blocksize, totalsize):
#     '''回调函数
#     @blocknum: 已经下载的数据块
#     @blocksize: 数据块的大小
#     @totalsize: 远程文件的大小
#     '''
#     print str(blocknum),str(blocksize),str(totalsize)
#     if blocknum * blocksize >= totalsize:
#         print '下载完成'
def quitit():
    """Print a farewell message and terminate the process with status 0."""
    print ("Bye!")
    # sys.exit is always available; the bare exit() builtin is injected by
    # the site module for interactive use and is missing under `python -S`.
    sys.exit(0)

if __name__ == '__main__':
    print ('''            *****************************************
**    Welcome to Spider for TUCHONG    **
**      Created on 2015-7-30           **
**      @author: miko                  **
*****************************************''')
    # The page number is hard-coded here; the validation loop below only
    # prompts when this value is not a number or is greater than 100.
    pageNo ='10' # input("Input the page number you want to scratch (1-100),please input 'quit' if you want to quit>")
    while not pageNo.isdigit() or int(pageNo) > 100 :
        if pageNo == 'quit':
            quitit()
        print ("Param is invalid , please try again.")
        # raw_input() does not exist on Python 3 (NameError); input() is
        # its direct replacement.
        pageNo = input("Input the page number you want to scratch >")

    # Crawl the TuChong "portrait" tag listing (the URL-encoded tag is 人像).
    html = getHtml("http://tuchong.com/tags/%E4%BA%BA%E5%83%8F/?page="+str(pageNo))

    # Visit each second-level page found in the listing and download its images.
    detllst = findUrl2(html)
    for detail in detllst:
        html2 = getHtml(detail)
        download(html2,pageNo)
    print ("Finished.")
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: