您的位置:首页 > 编程语言 > Python开发

python 百度贴吧爬虫(下载图片)

2014-09-24 14:43 656 查看
业余时用python写的百度贴吧爬虫程序,算是对学习python程序的一个练习。

本程序可以针对给定的贴吧链接,把帖子楼主的发言或者图片爬取出来,目前主要功能为下载所有楼主发的图片。爬取楼主发言的功能仅支持屏幕输出,没有保存到本地文件,有兴趣的朋友可以进行补充。仅供学习,转载请标明出处。

tieba_spider.py

#coding:utf-8
import urllib2,re,time,threading
import DownQueue

# Browser-like User-Agent string so requests look like they come from Chrome.
user_agent = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/37.0.2062.120 Safari/537.36'
)
# Tieba thread address in "original poster only" mode (see_lz=1); the page
# number is appended after "pn=".
url = 'http://tieba.baidu.com/p/3271638607?see_lz=1&pn='
# HTTP headers sent with every page request.
header = {'User-Agent': user_agent}

g_worker=DownQueue.down() # downloader thread object; Tieba_Spider pushes picture URLs to it

class Tieba_Spider(threading.Thread):
    """Crawler thread for one Baidu Tieba thread.

    Reads the page count and title from ``url``, then fetches every page
    (``url`` + page number). Depending on ``type`` it either prints the post
    contents (0) or pushes picture URLs to the module-level ``g_worker``
    downloader (1). When crawling ends, ``g_worker.set_flag(True)`` is called
    so the downloader knows no more work will arrive.
    """

    def __init__(self, url, type=0):
        """Remember the crawl target.

        url  -- page URL prefix; the page number is appended to it.
        type -- 0: print post text to the screen (default),
                1: queue pictures for download.
                (The name shadows the builtin but is kept for callers.)
        """
        threading.Thread.__init__(self)
        self.url = url
        self.type = type
        self.num = 0      # number of pages, filled in by get_info()
        self.title = ''   # thread title, filled in by get_info()

    def run(self):
        """Thread entry point: crawl the whole thread."""
        self.start_spider()

    def _fetch(self, address):
        """Return the body of ``address`` decoded from GBK.

        Raises urllib2.URLError (including HTTPError) on network failure.
        """
        req = urllib2.Request(address, headers=header)
        response = urllib2.urlopen(req)
        try:
            # 'replace' keeps one malformed byte from aborting the crawl.
            return response.read().decode('gbk', 'replace')
        finally:
            response.close()

    def _report_error(self, e):
        """Print the HTTP status code and/or reason carried by a URLError."""
        if hasattr(e, 'code'):
            print('Error code : %s' % (e.code,))
        if hasattr(e, 'reason'):
            print('Reason : %s' % (e.reason,))

    def get_info(self):
        """Fetch the first page and store the page count and the title.

        On a network error the error is printed and ``num`` / ``title`` keep
        their previous values.
        """
        try:
            htm = self._fetch(self.url)
        except urllib2.URLError as e:
            self._report_error(e)
            return

        self.num = self.get_page_num(htm)
        print('It has %d page' % self.num)
        self.title = self.get_title(htm)
        print('It\'s title is %s' % self.title)

    def start_spider(self):
        """Crawl pages 1..num and dispatch each page according to ``type``.

        A page that fails to download is reported and skipped.
        """
        try:
            self.get_info()

            for i in range(1, self.num + 1):
                print('start :  %s' % i)
                try:
                    htm = self._fetch(self.url + str(i))
                except urllib2.URLError as e:
                    self._report_error(e)
                    continue

                if self.type == 0:
                    self.page_deal(htm)
                elif self.type == 1:
                    self.down_pic(htm)
        finally:
            # Always signal the downloader, even if crawling blew up;
            # otherwise the downloader thread would wait forever.
            g_worker.set_flag(True)

    def get_page_num(self, htm):
        """Return the page count found in ``htm``, or 0 when none is found."""
        # \d+ (not \d*): an empty <span> must not reach int('').
        match = re.search(r'<span class="red">(\d+)</span>', htm)
        if match:
            return int(match.group(1))
        return 0

    def get_title(self, htm):
        """Return the thread title found in ``htm``, or '' when none is found."""
        match = re.search(r'class="core_title_txt(\s+)"(\s+)title="(.*?)"', htm)
        if match:
            return match.group(3)

        print('no match title')
        return ''

    def page_deal(self, htm):
        """Print the content of every post found in ``htm``."""
        posts = re.findall(r'id="post_content_(.*?)">(.*?)</div>', htm)
        if not posts:
            print('no deal')
            return
        for _post_id, content in posts:
            print('%s \n' % (content,))

    def down_pic(self, htm):
        """Push the URL of every post picture found in ``htm`` to ``g_worker``."""
        pictures = re.findall(
            r'<img class="BDE_Image" pic_type=(.*?)src="(.*?)"', htm)
        if not pictures:
            print('no deal')
            return
        for _pic_type, src in pictures:
            print('picture url : %s \n' % (src,))
            g_worker.push(src)

if __name__ == '__main__':
    # Second argument selects the mode: 1 downloads pictures, 0 prints the
    # original poster's posts to the screen.
    spider = Tieba_Spider(url, 1)
    spider.start()

    # Start the downloader thread that consumes the queued picture URLs.
    g_worker.start()


DownQueue.py

#coding:utf-8
import threading,Queue,re,time
import urllib2

class down(threading.Thread):
    """Downloader thread: saves every pushed picture URL under ./spider_pic/.

    Producers call push(url) for each picture and set_flag(True) once no more
    URLs will come. The thread exits after the queue has been drained and the
    stop flag is set.
    """

    _PIC_DIR = './spider_pic'

    def __init__(self):
        threading.Thread.__init__(self)
        self.queue = Queue.Queue(1000)
        # Counts pending wake-ups: one per queued URL, plus one per stop request.
        self.semaphore = threading.Semaphore(0)
        self.flag = False  # True once the producer asked the thread to stop

    def push(self, obj):
        """Queue the picture URL ``obj`` and wake the worker."""
        self.queue.put(obj)
        self.semaphore.release()

    def set_flag(self, f):
        """Set the stop flag; a true value also wakes the worker.

        Without the wake-up a worker that is already waiting on an empty
        queue would never notice the stop request.
        """
        self.flag = f
        if f:
            self.semaphore.release()

    def _download(self, obj):
        """Fetch the URL ``obj`` and write it to _PIC_DIR; raises IOError on failure."""
        import os  # local import: keeps this class self-contained

        response = urllib2.urlopen(obj)
        try:
            data = response.read()
        finally:
            response.close()

        # File name is everything after the last '/' of the URL.
        name = obj.rsplit('/', 1)[-1]
        print('dowing  %s' % (name,))

        if not os.path.isdir(self._PIC_DIR):
            os.makedirs(self._PIC_DIR)
        with open('%s/%s' % (self._PIC_DIR, name), 'wb') as fd:
            fd.write(data)

    def run(self):
        """Download queued pictures until the queue is empty and flag is set."""
        while True:
            self.semaphore.acquire()
            try:
                obj = self.queue.get_nowait()
            except Queue.Empty:
                # Woken without work: that is the stop request.
                if self.flag:
                    break
                continue

            try:
                self._download(obj)
            except IOError as e:
                # One broken picture must not abort the remaining downloads.
                print('download failed : %s (%s)' % (obj, e))


Tieba_Spider 类为爬虫类,负责爬出楼主发言中的图片链接,并将其推入down类的队列中。down类的工作为下载图片。两个类均继承自threading.Thread。仅供学习,转载请标明出处。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: