您的位置:首页 > 编程语言 > Python开发

Python 3.2 多线程小爬虫一只

2013-05-16 13:06 465 查看
# -*- coding:utf-8 -*-
"""
多线程抓取网页
"""

import gzip
from urllib import request as urllib2

import threading

from pyquery import PyQuery as pq

# Module-level list; nothing in the visible code appends to it or reads it.
threads = []
# Default start page for the crawl, and the prefix that pagination links are appended to.
web_site_url = "http://www.oschina.net/question/tag/python"  # OSChina questions tagged "python"

def work(url):
    """Download one listing page, schedule the next page, and print its questions.

    Args:
        url: Address of the page to fetch. Any falsy value (``None``, ``""``,
            ``()``) means "start from ``web_site_url``".

    The body is decoded with ``gzip_decode_content`` and parsed with PyQuery.
    ``get_next_page`` is invoked before the questions are printed, so the
    follow-up page is requested on its own thread while this page is still
    being processed.

    Raises:
        urllib.error.URLError: If the request fails (includes HTTPError).
    """
    # Sites that reject crawlers answer with "HTTP Error 403: Forbidden";
    # sending a browser-like User-Agent header avoids that rejection.
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    if not url:
        url = web_site_url
    req = urllib2.Request(url=url, headers=headers)
    # The context manager closes the connection even if read() raises.
    with urllib2.urlopen(req) as response:
        feed_data = response.read()
    feed_data = gzip_decode_content(feed_data)
    data = pq(feed_data)
    get_next_page(data)
    if data:
        data("ul li.question").each(parse_html)

def parse_html(i, element):
    """Print one question entry as a single tab-separated line.

    Used as the callback for PyQuery's ``each`` over ``ul li.question``.

    Args:
        i: Index of the element within the selection (unused).
        element: The matched question node.

    The printed columns are: the question link text, the date text up to its
    first comma, and the ``src`` of the image inside ``a.ShowUserOutline``
    (presumably the asker's avatar).
    """
    pq_element = pq(element)
    user_img = pq_element("a.ShowUserOutline img").attr("src")
    question = pq_element("div.qbody h2 a").text()
    # Fall back to "" so a missing date node cannot break the split below;
    # text() is not guaranteed to return a str for an empty selection in
    # every PyQuery release (TODO confirm against the installed version).
    date_str = pq_element("div.qbody div.Date").text() or ""
    date_str = date_str.split(",")[0].strip()
    print("%s\t%s\t%s" % (question, date_str, user_img))

def get_next_page(data):
    """Start a worker thread for the page that follows ``data``, if there is one.

    Args:
        data: Parsed page (PyQuery document). A falsy value is ignored.

    The "next" entry is looked up inside the second ``ul.pager`` element
    (``eq(1)``). The ``href`` of its link is appended to ``web_site_url`` and
    the resulting address is handed to ``work`` on a new thread. Nothing
    happens when the pager entry, its link, or the link's ``href`` is missing.
    """
    if not data:
        return
    page_li = data("ul.pager").eq(1).find("li.next")
    if not page_li:
        return
    page_params = page_li.find("a").attr("href")
    if not page_params:
        # attr() yields None when there is no link; concatenating that would
        # raise TypeError and abort the caller before it prints its page.
        return
    next_page_url = web_site_url + page_params
    threading.Thread(target=work, args=(next_page_url,)).start()

def gzip_decode_content(doc=""):
    """Return a response body as text, gunzipping it first when needed.

    Some pages are served gzip-compressed; others arrive as plain bytes.

    Args:
        doc: Raw response body as ``bytes``. A ``str`` (including the default
            ``""``) is treated as already decoded and returned unchanged.

    Returns:
        The body decoded as UTF-8 text.

    Raises:
        UnicodeDecodeError: If the (decompressed) body is not valid UTF-8.
    """
    import zlib

    if isinstance(doc, str):
        # Already text: there is nothing to decompress or decode.
        return doc
    try:
        raw = gzip.decompress(doc)
    except (OSError, EOFError, zlib.error):
        # Not a (complete) gzip stream, so treat the payload as plain bytes.
        raw = doc
    return raw.decode("utf-8")

def main():
    """Start the crawl from the default listing page."""
    # None is falsy, so work() falls back to web_site_url.
    work(None)

# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

# import urllib.request
#
# url = "http://www.oschina.net/"
# headers = ('User-Agent','Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
#
# opener = urllib.request.build_opener()
# opener.addheaders = [headers]
# data = opener.open(url).read()
#
# print(data)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: