您的位置:首页 > 运维架构 > 网站架构

使用QNetworkAccessManager爬取反盗链网站的图片

2016-06-22 23:33 423 查看
#coding=utf-8

__author__ = 'ds'

from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtNetwork import *
from PyQt4.QtWebKit import *
import os,sys

# Root directory for downloaded images; NetworkManager.onFinish joins it with
# a sub-directory derived from the image URL before writing each file.
DEFAULT_DIR = ur'F:\图片\美女'

# Custom network access manager: watches the traffic produced by the web page
# and saves the responses whose URL matches the image pattern.
class NetworkManager(QNetworkAccessManager):
    """QNetworkAccessManager that copies matching image replies to disk.

    Every request created through this manager is tagged (via the
    ``QNetworkRequest.User`` attribute) with a numeric id. Requests whose
    URL matches :meth:`isSaveURL` get a non-zero id and a byte buffer; the
    buffer is filled in :meth:`onReadyRead` and written below ``DEFAULT_DIR``
    in :meth:`onFinish`. All other requests are tagged with ``0``.
    """

    def __init__(self, parent = None):
        super(NetworkManager, self).__init__(parent)
        # request id -> QByteArray accumulating the bytes of that reply
        self._buffer = {}
        # next id to hand out; 0 is reserved for "do not save this reply"
        self._nextID = 1

    def createRequest(self, QNetworkAccessManager_Operation, request, QIODevice_device=None):
        """Tag the request with an id and hook the reply's signals.

        Returns the QNetworkReply produced by the base class.
        """
        if self.isSaveURL(request.url()):
            self._buffer[self._nextID] = QByteArray()
            request.setAttribute(QNetworkRequest.User, self._nextID)
        else:
            request.setAttribute(QNetworkRequest.User, 0)
        # Advance for every request so an id is never handed out twice.
        self._nextID += 1

        reply = super(NetworkManager, self).createRequest(
            QNetworkAccessManager_Operation, request, QIODevice_device)

        # Only plain-http replies are monitored.
        if reply.url().scheme() == 'http':
            reply.readyRead.connect(self.onReadyRead)
            reply.finished.connect(self.onFinish)

        return reply

    def isSaveURL(self, url):
        """Return True when the QUrl looks like ``<digits>/<digits>.jpg``."""
        rx = QRegExp(u'\\d+/\\d+\\d+\\.jpg')
        return url.toString().indexOf(rx) != -1

    def onReadyRead(self):
        """Append the bytes currently available on the sending reply."""
        reply = self.sender()
        qvar = reply.request().attribute(QNetworkRequest.User, QVariant())
        request_id = qvar.toInt()[0]
        # 0 means "not an image we save"; an id missing from the buffer means
        # the reply was already finalised. Either way there is nothing to do.
        if not request_id or request_id not in self._buffer:
            return

        # Replies arrive interleaved, so one shared buffer cannot be used:
        # bytes are collected per request id and consumed in onFinish.
        # peek() is used instead of read() so the data stays available to the
        # page itself.
        # NOTE(review): this presumably relies on the page draining the reply
        # after this slot runs, otherwise bytes would be appended twice --
        # confirm.
        self._buffer[request_id].append(reply.peek(reply.size()))

    def onFinish(self):
        """Write the buffered bytes of a finished image reply to disk.

        The buffer entry is always released. Nothing is written when the
        reply finished with a network error or delivered no data.
        """
        reply = self.sender()
        req = reply.request()
        # NOTE(review): a QNetworkRequest wrapper is presumably always truthy,
        # so this guard likely never fires -- confirm before removing it.
        if not req:
            print('!!!!碰到了无效的请求 :%s' % (reply.url().toString()))
            sys.exit(-10)

        qvar = req.attribute(QNetworkRequest.User, QVariant())
        if qvar.isNull():
            print('无效数据:%s' % (reply.url().toString()))
            return

        request_id = qvar.toInt()[0]
        # Only non-zero ids that still own a buffer carry data to save.
        if not request_id or request_id not in self._buffer:
            return

        # Pop first so the buffer is released on every exit path below.
        data = self._buffer.pop(request_id)
        url = reply.url().toString()

        # An aborted or failed transfer would otherwise be saved as a
        # truncated or bogus ".jpg"; an empty body has nothing worth saving.
        if reply.error() != QNetworkReply.NoError or data.isEmpty():
            print('下载失败,放弃保存:%s' % (url,))
            return

        # .../<a>/<b>/<file>.jpg  ->  DEFAULT_DIR/<a>/<b>/<file>.jpg
        segments = url.split('/')
        file_name = unicode(segments[-1])
        file_dirname = segments[-3] + os.path.sep + segments[-2]
        file_dir = os.path.join(DEFAULT_DIR, unicode(file_dirname))
        if not os.path.isdir(file_dir):
            os.makedirs(file_dir)

        file_fullname = os.path.join(file_dir, file_name)
        f = QFile(QString(file_fullname))
        if not f.open(QFile.WriteOnly):
            print('打开文件失败:%s' % (file_fullname))
            return

        f.write(data)
        f.close()
        print('成功下载文件到:%s 来自:%s' % (file_fullname, url))

# Custom QWebPage: once the current page has finished loading, continue with
# the next page of the album (or with the next album).
class WebPage(QWebPage):
    """QWebPage that walks through every page of a list of albums.

    Images are not handled here; loading each page is enough to make the
    page's network access manager see the image requests.
    """

    def __init__(self, parent = None):
        super(WebPage, self).__init__(parent)
        self.loadFinished.connect(self.onLoadFinish)
        # Album URLs that have not been started yet.
        self._albumn = []
        # Watchdog: if a load has not finished after 10 s, reload the page.
        self._timer = QTimer()
        self._timer.setInterval(10000)
        self._timer.timeout.connect(self.onTimeout)
        # URL currently being loaded.
        self._current = QUrl()

    def start(self):
        """Take the next pending album off the queue and load it."""
        if self._albumn:
            curAlbum = self._albumn[0]
            # Slice instead of pop(0) so the caller's list is left untouched.
            self._albumn = self._albumn[1:]
            self.loadURL(curAlbum)

    def loadURL(self, url):
        """Load ``url`` in the main frame and restart the watchdog timer."""
        self._timer.stop()
        self._current = QUrl(url)
        self.mainFrame().load(self._current)
        self._timer.start()

    def setAlbumns(self, albumn):
        """Set the list of album URLs to crawl."""
        self._albumn = albumn

    def onTimeout(self):
        """Reload the current URL (watchdog expiry or failed load)."""
        self.loadURL(self._current)
        print('刷新页面:%s' % (self._current.toString()))

    def onLoadFinish(self, ok):
        """Follow the "next page" link of the page that just finished loading.

        ``ok`` is the flag delivered by ``loadFinished``. On failure the
        current URL is reloaded. When the album has no further page, the next
        pending album is started; when none is left the process exits with
        status 0. Missing pagination markup exits with a negative status.
        """
        frame = self.mainFrame()
        last_url = frame.url()
        if not ok:
            print('当前页面加载失败,尝试重新加载:%s' % (last_url.toString(),))
            # Loading failed: try the same URL again.
            self.onTimeout()
            return

        if self._timer.isActive():
            print('停止当前的定时器。')
            self._timer.stop()

        # findFirstElement() returns a null QWebElement when nothing matches;
        # isNull() is the documented way to detect that.
        pager = frame.findFirstElement('#pages')
        if pager.isNull():
            print('当前页面不存在下一页 %s' % (last_url.toString(),))
            sys.exit(-1)

        links = pager.findAll('a.a1')
        if len(links) == 0:
            print('找不到上-下页标签%s' % (last_url.toString(),))
            sys.exit(-2)

        # The previous/next links share the same class; pick the one whose
        # text is the "next page" label.
        next_page = None
        for link in links:
            if link.toPlainText() == u'下一页':
                next_page = link
                break

        if next_page is None:
            print('找不到下一页:%s' % (last_url.toString(),))
            sys.exit(-3)

        href = next_page.attribute(u'href')
        print(str(href))
        # A real next page looks like /g/<album>/<page>; anything else means
        # the last page of this album has been reached.
        rx = QRegExp(u'/g/\\d+/\\d+')
        if href.indexOf(rx) == -1:
            print('已经加载到最后')
            # Continue with the remaining albums, if any.
            if self._albumn:
                self.start()
                return
            sys.exit(0)

        last_url.setPath(href)
        self.loadURL(last_url)
        print('加载下一页地址:%s' % (last_url.toString(),))

if __name__ == '__main__':
    # Python 2 only: switch the implicit str/unicode codec to utf-8 before
    # anything else runs.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    app = QApplication(sys.argv)

    # Album ids to crawl. A longer list used earlier:
    # '19019', '18812', '16751', '13207', '13206', '13205', '13148', '11363', '18214'
    PAGES = ['19019', '18812']
    pages = [QUrl(u'http://www.zngirls.com/g/%s' % (album_id,)) for album_id in PAGES]

    # The page runs without a view; to watch it, create a QWebView, call
    # setPage(page) on it and show() it.
    manager = NetworkManager()
    page = WebPage()
    page.setNetworkAccessManager(manager)
    page.setAlbumns(pages)
    page.start()

    sys.exit(app.exec_())
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  python pyqt