使用QNetworkManager爬取反盗链网站的图片
2016-06-22 23:33
423 查看
#coding=utf-8 __author__ = 'ds' from PyQt4.QtCore import * from PyQt4.QtGui import * from PyQt4.QtNetwork import * from PyQt4.QtWebKit import * import os,sys DEFAULT_DIR = ur'F:\图片\美女' #定制网络管理模块,从网络模块中筛选出定制的数据部分 class NetworkManager(QNetworkAccessManager): def __init__(self, parent = None): super(NetworkManager, self).__init__(parent) self._buffer = {} self._nextID = 1 def createRequest(self, QNetworkAccessManager_Operation, request, QIODevice_device=None): if self.isSaveURL(request.url()): self._buffer[self._nextID] = QByteArray() request.setAttribute(QNetworkRequest.User, self._nextID) else: request.setAttribute(QNetworkRequest.User, 0) self._nextID += 1 reply = super(NetworkManager, self).createRequest(QNetworkAccessManager_Operation, request, QIODevice_device) if reply.url().scheme() == 'http': reply.readyRead.connect(self.onReadyRead) reply.finished.connect(self.onFinish) return reply def isSaveURL(self, url): url = url.toString() rx = QRegExp(u'\\d+/\\d+\\d+\.jpg') if url.indexOf(rx) == -1: return False return True def onReadyRead(self): reply = self.sender() qvar = reply.request().attribute(QNetworkRequest.User, QVariant()) id = qvar.toInt()[0] if not id: return #因为加载顺序是异步的,所以不能使用直接缓存,最后直接在finish里面使用 #因为每次的readReady可能来自不能的文件,通过文件来源进行键值对的缓存 size = reply.size() if id: self._buffer[id].append(reply.peek(size)) def onFinish(self): reply = self.sender() req = reply.request() if not req: print('!!!!碰到了无效的请求 :%s' % (reply.url())) sys.exit(-10) return qvar = req.attribute(QNetworkRequest.User, QVariant()) if qvar.isNull(): print('无效数据:%s' % (reply.url())) return id = qvar.toInt()[0] #id不是0的时候为有效的数据 if not id or id not in self._buffer: return url = reply.url().toString() segments = url.split('/') file_name = unicode(segments[-1]) file_dirname = segments[-3] + os.path.sep + segments[-2] file_dir = os.path.join(DEFAULT_DIR, unicode(file_dirname)) if not os.path.isdir(file_dir): os.makedirs(file_dir) file_fullname = os.path.join(file_dir, file_name) f = 
QFile(QString(file_fullname)) ok = f.open(QFile.WriteOnly) if not ok: print('打开文件失败:%s' % (file_fullname)) return f.write(self._buffer[id]) f.close() self._buffer.pop(id) print('成功下载文件到:%s 来自:%s' % (file_fullname, url)) #定制QWebPage,在下载完成当前页面后,继续下一页 class WebPage(QWebPage): def __init__(self, parent = None): super(WebPage, self).__init__(parent) self.loadFinished.connect(self.onLoadFinish) self._albumn = [] self._timer = QTimer() #3ms处理一次 self._timer.setInterval(10000) self._timer.timeout.connect(self.onTimeout) #当前的URL地址 self._current = QUrl() def start(self): if len(self._albumn) > 0: curAlbum = self._albumn[0] self._albumn = self._albumn[1:] self.loadURL(curAlbum) def loadURL(self,url): self._timer.stop() self._current = QUrl(url) self.mainFrame().load(self._current) self._timer.start() def setAlbumns(self, albumn): self._albumn = albumn def onTimeout(self): #重新加载 self.loadURL(self._current) print('刷新页面:%s' % (self._current.toString())) def onLoadFinish(self, ok): frame = self.mainFrame() last_url = frame.url() if not ok: print('当前页面加载失败,尝试重新加载:%s' % (last_url,)) #加载失败的话,重新加载 self.onTimeout() return if self._timer.isActive(): print('停止当前的定时器。') self._timer.stop() pages = frame.findFirstElement('#pages') if not pages: print('当前页面不存在下一页 %s' % (last_url,)) sys.exit(-1) next_page = None pages = pages.findAll('a.a1') if len(pages) == 0: print('找不到上-下页标签%s' % (last_url,)) sys.exit(-2) for p in pages: if p.toPlainText() == u'下一页': next_page = p break if not next_page: print('找不到下一页:%s' % (last_url,)) sys.exit(-3) href = next_page.attribute(u'href') print(str(href)) rx = QRegExp(u'/g/\\d+/\\d+') if href.indexOf(rx) == -1: print('已经加载到最后') #继续加载剩余的地址 if len(self._albumn) > 0: self.start() return else: sys.exit(0) last_url.setPath(href) self.loadURL(last_url) print('加载下一页地址:%s' % (last_url.toString(),)) if __name__ == '__main__': reload(sys) sys.setdefaultencoding('utf-8') app = QApplication(sys.argv) #app.setQuitOnLastWindowClosed(True) #PAGES = [ '19019','18812','16751', 
'13207', '13206', '13205', '13148','11363','18214',] PAGES = [ '19019','18812',] pages = [] for p in PAGES: url = QUrl(u'http://www.zngirls.com/g/%s' % (p,)) pages.append(url) page = WebPage() page.setNetworkAccessManager(NetworkManager()) page.setAlbumns(pages) #wv = QWebView() #wv.setWindowTitle(u'下载高清图片') #wv.setPage(page) #wv.show() page.start() sys.exit(app.exec_())
相关文章推荐
- Python动态类型的学习---引用的理解
- Python3写爬虫(四)多线程实现数据爬取
- 垃圾邮件过滤器 python简单实现
- 下载并遍历 names.txt 文件,输出长度最长的回文人名。
- install and upgrade scrapy
- Scrapy的架构介绍
- Centos6 编译安装Python
- 使用Python生成Excel格式的图片
- 让Python文件也可以当bat文件运行
- [Python]推算数独
- Python中zip()函数用法举例
- Python中map()函数浅析
- Python将excel导入到mysql中
- Python在CAM软件Genesis2000中的应用
- 使用Shiboken为C++和Qt库创建Python绑定
- FREEBASIC 编译可被python调用的dll函数示例
- Python 七步捉虫法