python爬虫 分页获取图片并下载
2015-08-14 14:17
806 查看
--刚接触 Python 才两天，想快速上手，就写了个爬虫，写完之后，成就感暴增，用起来顺手多了。
download-succeed -> 1_1_1bdbc1d1628a1f0ebd5fc60055ee506e.jpg
download-succeed -> 1_2_01b5b45171979aace617ab79299d7515.jpg
download-succeed -> 1_3_5698c42371add40501a328ef2c753b4d.jpg
download-succeed -> 1_4_f7219087ce29c474a777867b8e4755ed.jpg
download-succeed -> 1_5_58bf8172ea8bbc4cee0a0f8240f2b289.jpg
download-succeed -> 1_6_b4700f4bd96f90039ed662ebbf6c1f7c.jpg
download-succeed -> 1_7_8a637b3362acddac4671d9ad02e4a93f.jpg
download-succeed -> 1_8_f28e22908b68d6fbe42a15c4fcd62613.jpg
download-succeed -> 1_9_03806c0b3d33cfc3a3eb4ea3bbe8ca9e.jpg
download-succeed -> 1_10_cf26fb246e9b57c06e328af94e60450b.jpg
download-succeed -> 1_11_7563610f39bd29b8381201b95eed2624.jpg
download-succeed -> 1_12_8ccaccede13d0f377d0d8822243f3b6a.jpg
download-succeed -> 1_13_c95a0207db67a334be4812cec25d7023.jpg
download-succeed -> 1_14_71ce070aef91660e8dad60a5919ec505.jpg
download-succeed -> 1_15_9a647a8f449cdb3208a561b4c9fe2ce6.jpg
download-succeed -> 1_16_45d9992e3d5080cf14ef73da14066283.jpg
download-succeed -> 1_17_7bd84ee7d6f5cb911a3b1dbc6e0775c4.jpg
download-succeed -> 1_18_8397b9d434a187444c389ebff48bcfb5.jpg
download-succeed -> 2_1_f14e658f2464769756039e1ff18d5693.jpg
download-succeed -> 2_2_ad051a669008969800ccd324de056465.jpg
download-succeed -> 2_3_6190ffe369199b95274100996b02359a.jpg
download-succeed -> 2_4_f14dce28d960941781a12a57123076df.jpg
download-succeed -> 2_5_c7fb3b6f700339e9f3c9ee02474211eb.jpg
download-succeed -> 2_6_327f1a33b8c5989a2d014ea41565caef.jpg
...
1.源码
# coding=utf-8
"""Paged image crawler (Python 3 rewrite of the original Python 2 snippet).

Crawls a paginated list page, extracts detail-page links with a regex, then
downloads every image found on each detail page.

Fixes over the original:
  * Ported to Python 3: ``print()``, ``urllib.request``, ``key in dict``
    (``dict.has_key`` was removed), ``bytes.decode`` instead of ``unicode``.
  * ``__slots__`` listed only ``('url', 'regex', 'arg')`` while ``__init__``
    assigned six different attributes; it only worked because old-style
    classes ignore ``__slots__``.  The tuple now matches reality.
  * Bare ``except:`` clauses narrowed to ``OSError`` so KeyboardInterrupt
    and programming errors are no longer silently swallowed.
  * ``writeFile`` uses a ``with`` block; the crawl itself is guarded by
    ``__name__ == '__main__'`` so importing the module does no network I/O.
  * Regexes are raw strings (``\s`` in a plain literal is a
    DeprecationWarning on Python 3.6+).
"""
import re
import urllib.request


class Page(object):
    """Fetch one URL (plus optional pagination) and act on the matched links.

    ``arg`` options (all optional):
      download -- download every matched URL (default False)
      write    -- append matched URLs to <outpath>list.txt (default False)
      outpath  -- path prefix for downloads and list.txt (default '')
      custom   -- callable(arr, prefix) invoked on each page's matches
      pagin    -- {'start': int, 'end': int, 'rule': url containing '{page}'}
    """

    __slots__ = ('filter', 'url', 'outpath', 'download', 'write', 'pagin')

    def __init__(self, url, regex, arg):
        arg = arg or {}  # tolerate None / empty option dicts
        self.filter = Filter(url, {
            'regex': regex,
            'custom': arg.get('custom', ''),
        })
        self.url = url
        self.outpath = arg.get('outpath', '')
        self.download = arg.get('download', False)
        self.write = arg.get('write', False)
        self.pagin = arg.get('pagin', False)

    def start(self, *prefix):
        """Crawl self.url and, if 'pagin' was given, pages start..end.

        An optional positional argument becomes the filename prefix for the
        first page (default '1'); paginated pages use their page number.
        """
        first_prefix = prefix[0] if prefix else '1'
        self.getHtml(self.url, first_prefix)
        if self.pagin:
            page = self.pagin['start']
            while page <= self.pagin['end']:
                url = self.pagin['rule'].replace('{page}', str(page))
                self.getHtml(url, str(page))
                page += 1

    def down(self, url, prefix):
        """Download one URL into outpath as '<prefix>_<basename>'."""
        filename = str(prefix) + '_' + url[url.rfind('/') + 1:]
        try:
            urllib.request.urlretrieve(url, self.outpath + filename)
        except OSError as exc:  # URLError / ContentTooShortError are OSErrors
            print('download->failed', url, exc)
        else:
            print('download-succeed\t->', filename)

    def downs(self, arr, prefix):
        """Download every URL in *arr* under the same prefix."""
        for url in arr:
            self.down(url, prefix)

    def writeFile(self, arr):
        """Append the matched URLs to <outpath>list.txt, one per line."""
        with open(self.outpath + 'list.txt', 'a+') as fh:
            fh.write('\n\n' + '\n'.join(arr))

    def getHtml(self, url, prefix):
        """Fetch *url*, run the filter over it, then download/write results."""
        try:
            with urllib.request.urlopen(url) as resp:
                raw = resp.read()
        except OSError as exc:
            print('fetch->failed', url, exc)
            return
        # The target site serves gb2312; replace any undecodable bytes
        # instead of aborting the whole page.
        html = raw.decode('gb2312', errors='replace')
        arr = self.filter.execute(html, prefix)
        if self.download:
            self.downs(arr, prefix)
        if self.write:
            self.writeFile(arr)


class Filter(object):
    """Extract regex group 1 from HTML and normalise matches to absolute URLs."""

    def __init__(self, url, arg):
        # arg: {'regex': pattern, 'custom': callable or ''}
        self.arg = arg
        self.url = url

    def _getDomain(self):
        """Return scheme + host of self.url, e.g. 'http://www.netbian.com'."""
        parts = self.url.split('/')
        return parts[0] + '//' + parts[2]

    def _getRealUrl(self, domain, url):
        """Absolutise *url* against *domain*; already-absolute URLs pass through."""
        if url[0] == '/':
            return domain + url
        if 'http://' in url:
            return url
        return domain + '/' + url

    def execute(self, html, prefix):
        """Return all absolutised group-1 matches; invoke the 'custom' hook."""
        domain = self._getDomain()  # hoisted: invariant across matches
        arr = [self._getRealUrl(domain, m.group(1))
               for m in re.finditer(self.arg['regex'], html)]
        custom = self.arg.get('custom', '')
        if custom:
            custom(arr, prefix)
        return arr


def paginList(arr, prefix):
    """For each detail-page URL, download the images on that page."""
    for num, page_url in enumerate(arr, start=1):
        Page(page_url, r'<p><img\ssrc="(.*?)"\salt.*?</p>', {
            'download': True,
            'outpath': 'f:/temp/',
        }).start(prefix + '_' + str(num))


if __name__ == '__main__':
    Page("http://www.netbian.com/fengjing/",
         r'<li><a\shref="(.*?)"\s.*?\salt="(.*?)"\s.*?</li>',
         {
             'custom': paginList,
             'pagin': {
                 'start': 2,
                 'end': 10,
                 'rule': 'http://www.netbian.com/fengjing/index_{page}.htm',
             },
         }).start()
2.运行如下
$ python getjpg.py
download-succeed -> 1_1_1bdbc1d1628a1f0ebd5fc60055ee506e.jpg
download-succeed -> 1_2_01b5b45171979aace617ab79299d7515.jpg
download-succeed -> 1_3_5698c42371add40501a328ef2c753b4d.jpg
download-succeed -> 1_4_f7219087ce29c474a777867b8e4755ed.jpg
download-succeed -> 1_5_58bf8172ea8bbc4cee0a0f8240f2b289.jpg
download-succeed -> 1_6_b4700f4bd96f90039ed662ebbf6c1f7c.jpg
download-succeed -> 1_7_8a637b3362acddac4671d9ad02e4a93f.jpg
download-succeed -> 1_8_f28e22908b68d6fbe42a15c4fcd62613.jpg
download-succeed -> 1_9_03806c0b3d33cfc3a3eb4ea3bbe8ca9e.jpg
download-succeed -> 1_10_cf26fb246e9b57c06e328af94e60450b.jpg
download-succeed -> 1_11_7563610f39bd29b8381201b95eed2624.jpg
download-succeed -> 1_12_8ccaccede13d0f377d0d8822243f3b6a.jpg
download-succeed -> 1_13_c95a0207db67a334be4812cec25d7023.jpg
download-succeed -> 1_14_71ce070aef91660e8dad60a5919ec505.jpg
download-succeed -> 1_15_9a647a8f449cdb3208a561b4c9fe2ce6.jpg
download-succeed -> 1_16_45d9992e3d5080cf14ef73da14066283.jpg
download-succeed -> 1_17_7bd84ee7d6f5cb911a3b1dbc6e0775c4.jpg
download-succeed -> 1_18_8397b9d434a187444c389ebff48bcfb5.jpg
download-succeed -> 2_1_f14e658f2464769756039e1ff18d5693.jpg
download-succeed -> 2_2_ad051a669008969800ccd324de056465.jpg
download-succeed -> 2_3_6190ffe369199b95274100996b02359a.jpg
download-succeed -> 2_4_f14dce28d960941781a12a57123076df.jpg
download-succeed -> 2_5_c7fb3b6f700339e9f3c9ee02474211eb.jpg
download-succeed -> 2_6_327f1a33b8c5989a2d014ea41565caef.jpg
...
3.结果如下
相关文章推荐
- 用python合并N个不同字符集编码的sql文件的实践
- python2.7搭建
- python如何生成重复单一值的序列
- 怎样给python编译成dll文件
- Python yield分析
- python打开网页获取网页内容方法总结
- 零基础学python-2.19 定义函数、调用函数与默认参数
- 安装mysql-python
- 总结python中的乱码问题
- python调用wcf服务 实现网站对客户端的调用
- 零基础学python-2.21 回到我们的游戏 加入类和函数
- 零基础学python-2.21 回到我们的游戏 加入类和函数
- Python中的super()方法使用简介
- 简单实现Socks5代理(Python&C#)
- 04 Python 对象 - 《Python 核心编程》
- 零基础学python-2.19 定义函数、调用函数与默认参数
- Python: 搜狗分类语料库gb2312编码为utf-8
- python使用thrift2协议connect hbase
- 小白学python(8.14)
- python 定时器