爬虫实例抓取并download with BeautifulSoup
2016-03-06 19:57
357 查看
# Image scraper: fetch a web page, find every <img> tag on it and save the
# referenced files into a local directory.
#
# The script was written for Python 2 (urllib2).  The import shim below maps
# the same functions to their Python 3 locations so it runs on either version.
import os
import re
import socket
import sys
import time
from contextlib import closing

try:  # Python 3 module layout
    from urllib.error import HTTPError
    from urllib.parse import urlencode, urljoin
    from urllib.request import (
        ProxyHandler,
        Request,
        build_opener,
        install_opener,
        urlopen,
        urlretrieve,
    )
except ImportError:  # Python 2 module layout
    from urllib import urlencode, urlretrieve
    from urllib2 import (
        HTTPError,
        ProxyHandler,
        Request,
        build_opener,
        install_opener,
        urlopen,
    )
    from urlparse import urljoin

USER_AGENT = 'Custom User-Agent'
TIMEOUT_SECONDS = 60
_TIMESTAMP_FORMAT = '%Y-%m-%d_%H:%M:%S'
# Characters that may not appear in a file name (Windows rules).
_ILLEGAL_FILENAME_CHARS = re.compile(r'[/\\:\*\"?|<>]+')


def build_request(link, values=None):
    """Build the HTTP request used to fetch ``link``.

    Args:
        link: URL of the page to fetch.
        values: Optional mapping of form fields.  When given (and non-empty)
            it is URL-encoded and sent as the request body, which turns the
            request into a POST.  When omitted the request is a plain GET.

    Returns:
        A ``Request`` object carrying the custom ``User-Agent`` header.
    """
    data = None
    if values:
        # urlencode() output is pure ASCII; Python 3 requires a bytes body.
        data = urlencode(values).encode('ascii')
    return Request(link, data, {'User-Agent': USER_AGENT})


def build_urllib2(link):
    """Configure the process-wide URL-opening defaults.

    Sets the default socket timeout to ``TIMEOUT_SECONDS`` and installs a
    global opener built with an empty proxy mapping.  A timestamp is printed
    before and after the setup.

    Args:
        link: Not used; kept so existing callers keep working.
    """
    print(time.strftime(_TIMESTAMP_FORMAT))
    socket.setdefaulttimeout(TIMEOUT_SECONDS)
    # An empty mapping tells ProxyHandler to use no proxy at all instead of
    # auto-detecting one from the environment.
    install_opener(build_opener(ProxyHandler({})))
    print('after install opener')
    print(time.strftime(_TIMESTAMP_FORMAT))


def _local_filename(img_link):
    """Return the file name for ``img_link``: its last path segment,
    with ``.jpg`` appended when the segment contains no dot."""
    name = img_link.split('/')[-1]
    return name if '.' in name else name + '.jpg'


def get_img(link, path):
    """Download every image referenced by the page at ``link`` into ``path``.

    The page is fetched with ``build_request(link)`` and parsed with
    BeautifulSoup.  For each ``<img>`` tag the ``src`` is resolved against
    ``link``, a local file name is derived from its last path segment, and
    the file is downloaded unless it already exists or its name contains a
    character matched by ``_ILLEGAL_FILENAME_CHARS``.  A failed download is
    reported and skipped; the remaining images are still attempted.

    Args:
        link: URL of the page to scan.
        path: Directory to save the images in; created if it does not exist.

    Returns:
        None.  If the page request fails with an HTTP error, the status code,
        message, headers and body of the error response are printed and the
        function returns without downloading anything.
    """
    # Third-party dependency, imported here so the rest of the module can be
    # imported (e.g. to reuse build_request) without bs4 installed.
    from bs4 import BeautifulSoup

    print('start to download ' + link)
    try:
        # closing() guarantees the connection is released on both Python 2
        # and Python 3, even if read() raises.
        with closing(urlopen(build_request(link))) as response:
            html = response.read()
    except HTTPError as e:
        print(e.code)
        print(e.msg)
        print(e.headers)
        print(e.fp.read())
        return

    print('start to create soup')
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    images = soup.find_all('img')
    print('ccccccccccc')
    if not images:
        print('no pic there')
        return

    print('ok, start to download')
    if not os.path.isdir(path):
        os.makedirs(path)

    for img in images:
        src = img.get('src')
        if not src:
            # <img> without a usable src attribute: nothing to download.
            continue
        # Resolve relative and protocol-relative references against the page.
        img_link = urljoin(link, src)
        filename = _local_filename(img_link)
        file_path = os.path.join(path, filename)
        if os.path.exists(file_path):
            continue
        if _ILLEGAL_FILENAME_CHARS.search(filename):
            print('continue')
            continue
        print('downloading ' + filename)
        try:
            urlretrieve(img_link, file_path)
        except Exception as err:
            # Deliberately best-effort: one bad image must not abort the run.
            # Unlike a bare except, Ctrl-C and SystemExit still propagate.
            print('T_T, Failed to download ' + filename + ' (' + repr(err) + ')')
            continue


weblink = "http://club.history.sina.com.cn/thread-5534627-1-1.html"
mypath = "G:\\python\\test\\"

if __name__ == '__main__':
    # Optional overrides: script.py [PAGE_URL [TARGET_DIR]]
    cli_args = sys.argv[1:]
    page_url = cli_args[0] if cli_args else weblink
    target_dir = cli_args[1] if len(cli_args) > 1 else mypath
    build_urllib2(page_url)
    get_img(page_url, target_dir)
相关文章推荐
- C# 笔记(怕电脑抽疯,存起来)
- dp学习
- App上架流程
- Java 引用类解析
- OpenCV中ROI 总结
- 配置 maven 编译的 JDK 版本
- stm32 hal i2c 库读写sd3088时钟
- 软件测试 homework2
- 多分类问题中,实现不同分类区域颜色填充的MATLAB代码(demo:Random Forest)
- hbase 读写过程
- asp.net 通过web.config 文件设置网站的mime类型
- 买了个新键盘,雷柏V56,也没有个说明书,只好自己写个了。
- bash特性
- BZOJ3038上帝造题的七分钟2
- 关于网络请求返回数据是nil的情况
- web前端—沉静、务实
- 查找算法7
- 雪花飘落 - 定时器(NSTimer/CADisplayLink)
- Linux内核分析实验二:mykernel实验指导(操作系统是如何工作的)
- 抓取网页数据C#文件