您的位置:首页 > 其它

爬虫实例:使用 BeautifulSoup 抓取并下载图片

2016-03-06 19:57 357 查看
from bs4 import BeautifulSoup

import urllib2

import urllib, os, re, time, sys

#import socket

def build_request(link):
    """Return the ``urllib2.Request`` used to fetch ``link``.

    The request carries a url-encoded form body and a ``User-Agent`` of
    ``'Custom User-Agent'``.  The agent is registered twice: once as a
    regular header and once as an unredirected header.

    NOTE(review): because a body is supplied, urllib2 sends this request as
    a POST.  The form values look like tutorial placeholders -- confirm the
    target site really needs them.

    Args:
        link: URL the request is addressed to.

    Returns:
        The configured ``urllib2.Request`` instance.
    """
    agent = 'Custom User-Agent'
    form_fields = {
        'name': 'Michael Foord',
        'location': 'Northampton',
        'language': 'Python',
    }

    request = urllib2.Request(
        link,
        urllib.urlencode(form_fields),
        {'User-Agent': agent},
    )
    # Same value again, stored among the headers that are kept separate
    # from the regular ones (``unredirected_hdrs``).
    request.add_unredirected_header('User-Agent', agent)
    return request

def build_urllib2(link):
    """Apply the process-wide urllib2 settings used by the downloader.

    Sets the default socket timeout to 60 seconds and installs a global
    opener built around a ``ProxyHandler`` with an empty proxy mapping.
    The current local time is printed before and after the setup.

    Args:
        link: Accepted for symmetry with the other helpers; not used here.
    """
    stamp_format = '%Y-%m-%d_%H:%M:%S'
    print(time.strftime(stamp_format, time.localtime()))

    # Default timeout, in seconds, for sockets created from now on.
    urllib2.socket.setdefaulttimeout(60)

    # An explicit empty mapping means the opener is given no proxies to use.
    urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler({})))

    print('after install opener')
    print(time.strftime(stamp_format, time.localtime()))

def get_img(link, path):
    """Download the images referenced by the page at ``link`` into ``path``.

    The page is fetched with the request produced by ``build_request`` and
    parsed with BeautifulSoup.  Each ``<img>`` tag's ``src`` is saved under
    ``path``, using the last ``/``-separated segment of the ``src`` as the
    file name (``.jpg`` is appended when that name contains no dot).

    An image is skipped when the tag has no ``src``, when the target file
    already exists, or when the file name contains a character matched by
    the illegal-character pattern below.  A failed download is reported and
    the loop moves on to the next image.

    If fetching the page raises ``urllib2.HTTPError``, the status code,
    message, headers and response body are printed and the function
    returns.  If the page contains no ``<img>`` tags, ``sys.exit(0)`` is
    called.

    Args:
        link: URL of the page to scan for images.
        path: Directory the image files are written to.
    """
    # Local import: urlparse is not among the file-level imports.
    import urlparse

    print('start to download ' + link)

    # Characters that must not appear in the saved file name.
    illegal_chars = re.compile(r'[/\\:\*\"?|<>]+')

    # Only the page request can raise HTTPError, so keep the try narrow.
    try:
        page = urllib2.urlopen(build_request(link))
    except urllib2.HTTPError as e:
        print(e.code)
        print(e.msg)
        print(e.headers)
        print(e.fp.read())
        return

    try:
        print('start to create soup')
        soup = BeautifulSoup(page)
    finally:
        # The response is only needed while parsing; release it afterwards.
        page.close()

    my_img = soup.find_all('img')

    print('ccccccccccc')
    if not my_img:
        print('no pic there')
        sys.exit(0)

    print('ok, start to download')
    for img in my_img:
        img_link = img.get('src')
        if not img_link:
            # <img> without a usable src attribute: nothing to download.
            continue

        filename = img_link.split("/")[-1]
        if '.' not in filename:
            # Add a suffix to the file name if it does not have one.
            filename = filename + '.jpg'
        file_path = os.path.join(path, filename)

        if os.path.exists(file_path):
            continue
        if illegal_chars.search(filename):
            print('continue')
            continue

        print('downloading ' + filename)
        try:
            # Resolve relative src values against the page URL; an absolute
            # src is returned unchanged by urljoin.
            urllib.urlretrieve(urlparse.urljoin(link, img_link), file_path, None)
        except Exception:
            # Best effort: report the failure and carry on, but let
            # KeyboardInterrupt / SystemExit propagate.
            print('T_T, Failed to download ' + filename)

# Page whose <img> tags are downloaded when this file is run.
weblink = "http://club.history.sina.com.cn/thread-5534627-1-1.html"

# Destination directory for the downloaded images (hard-coded Windows path).
mypath = "G:\\python\\test\\"
# Install the default timeout and the opener before any request is made.
build_urllib2(weblink)

# Fetch the page and save its images under mypath.
get_img(weblink, mypath)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: