您的位置:首页 > 理论基础 > 计算机网络

基于Python的urllib2模块的多线程网络爬虫程序

2014-07-09 11:09 555 查看
from Queue import Queue
from StringIO import StringIO
from gzip import GzipFile
from threading import Lock, Thread
import socket
import time
import urllib2
class ContentEncodingProcessor(urllib2.BaseHandler):
    """urllib2 handler adding gzip/deflate support to requests.

    Outgoing requests advertise compressed encodings; compressed
    responses are transparently unwrapped so callers can ``read()``
    plain bytes as usual.
    """

    def http_request(self, req):
        """Advertise that we accept compressed response bodies."""
        req.add_header("Accept-Encoding", "gzip, deflate")
        return req

    def http_response(self, req, resp):
        """Replace a compressed response with a decompressing wrapper."""
        old_resp = resp
        encoding = resp.headers.get("content-encoding")
        if encoding == "gzip":
            # GzipFile over the buffered body decompresses lazily on read().
            gz = GzipFile(fileobj=StringIO(resp.read()), mode="r")
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        elif encoding == "deflate":
            # deflate bodies are decompressed eagerly; deflate() handles both
            # raw-DEFLATE and zlib-wrapped streams (servers send either).
            gz = StringIO(deflate(resp.read()))
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

# deflate support
import zlib
def deflate(data):
    """Decompress *data*, accepting raw-DEFLATE or zlib-wrapped input.

    zlib only speaks the zlib container natively; a negative wbits value
    selects a headerless (raw) DEFLATE stream, so try that first and fall
    back to the standard zlib format on error.
    """
    try:
        # Negative wbits: raw DEFLATE, no zlib header/checksum.
        return zlib.decompress(data, -zlib.MAX_WBITS)
    except zlib.error:
        # Not raw — assume an ordinary zlib-wrapped stream.
        return zlib.decompress(data)

# Fail fast on stalled connections instead of blocking a worker forever.
socket.setdefaulttimeout(10)

# Build an opener with transparent gzip/deflate decoding and install it
# globally: Fetcher.threadget() uses plain urllib2.urlopen(), which only
# picks up the encoding handler once the opener is installed.  (Without
# install_opener the handler above is built but never used.)
encoding_support = ContentEncodingProcessor
opener = urllib2.build_opener(encoding_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)

class Fetcher:
def __init__(self,threads):
self.opener = urllib2.build_opener(urllib2.HTTPHandler)
self.lock = Lock() #线程锁
self.q_req = Queue() #任务队列
self.q_ans = Queue() #完成队列import socket
self.threads = threads
for i in range(threads):
t = Thread(target=self.threadget)
t.setDaemon(True)
t.start()
self.running = 0

def __del__(self): #解构时需等待两个队列完成
time.sleep(0.5)
self.q_req.join()
self.q_ans.join()

def taskleft(self):
return self.q_req.qsize()+self.q_ans.qsize()+self.running

def push(self,req):
self.q_req.put(req)

def pop(self):
return self.q_ans.get()

def threadget(self):
while True:
ans = ''
req = self.q_req.get()
#       print req

with self.lock: #要保证该操作的原子性,进入critical area
self.running += 1

try:
#               ans = self.opener.open(req).read()
#content =  opener.open(req).read()
content = urllib2.urlopen(req).read()
#    print temp.geturl()
#    print req
#    add gzip support from here
ans = str(content)
except Exception, what:
print what
pass

self.q_ans.put((ans,req))
with self.lock:
self.running -= 1
self.q_req.task_done()
time.sleep(0.01) # don't spam

if __name__ == "__main__":
a = [0] * 3600000
links = [ 'http://www.songtaste.com/song/%d/'%i for i in range(1,3600000) ]
f = Fetcher(threads=50)
for url in links:
f.push(url)
while f.taskleft():
the_page,x =f.pop()
# print the_page
try:
npos = the_page.index('chart#fav')
except :
pass
else:
for j in range(npos,1,-1):
if the_page[j] == ',':
k = j
break
sum = 0 ;
t = 1 ;
for j in range(k-1,1,-1):
if  the_page[j] <= '9' and the_page[j] >='0':
sum = sum + (int(the_page[j]) - int('0')) * t
t *= 10;
else :
break
p = int(x[30:-1])
if(p % 10000 <= 5  )
a[p] = sum
if sum != 0:
print p
print sum


View Code
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: