您的位置:首页 > 其它

pyspider采集例子

2017-07-04 12:33 169 查看
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-04-04 10:35:52
# Project: retries

from pyspider.libs.base_handler import *
import re

class Handler(BaseHandler):
    """pyspider project that collects question/answer pages from mofangge.com.

    Crawl flow, as wired by the callbacks below:
    home page -> ``/qlist/<name>/`` listing pages -> detail pages, where each
    detail page also schedules further detail pages from the links in its
    table cells.
    """

    # No project-wide crawl options are set.
    crawl_config = {
    }

    def on_start(self):
        """Schedule the site home page as the entry point of the crawl."""
        self.crawl('http://www.mofangge.com/', callback=self.index_page)

    @config(priority=4)
    def index_page(self, response):
        """Follow every absolute link on the home page that is a qlist URL.

        Only links whose href starts with ``http://www.mofangge.com/qlist/``
        followed by at least one word character and a slash are scheduled,
        with ``list_page`` as the callback.
        """
        for each in response.doc('a[href^="http"]').items():
            href = each.attr.href
            # Raw string so ``\w`` reaches the regex engine untouched, and the
            # dots are escaped so they only match a literal '.' in the host.
            if re.match(r"http://www\.mofangge\.com/qlist/\w+/", href, re.U):
                self.crawl(href, callback=self.list_page)

    @config(priority=3)
    def list_page(self, response):
        """Schedule a detail-page fetch for each link under ``.seoleftul``."""
        for each in response.doc('.seoleftul A').items():
            href = each.attr.href
            # The selector does not require an href attribute, so skip
            # anchors that have none instead of scheduling an empty URL.
            if href:
                self.crawl(href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract one question record and queue the links found in table cells.

        Returns a dict with the keys ``url``, ``question``, ``answer`` and
        ``subject``. The last three are whatever ``.html()`` yields for their
        selectors (presumably ``None`` when a selector matches nothing --
        confirm against pyquery).
        """
        # Links inside table cells are fed back into this same callback.
        for each in response.doc('td a').items():
            href = each.attr.href
            # Same guard as in list_page: 'td a' may match href-less anchors.
            if href:
                self.crawl(href, callback=self.detail_page)
        return {
            "url": response.url,
            "question": response.doc('#q_indexkuai2 table').html(),
            "answer": response.doc('#q_indexkuai3 table').html(),
            "subject": response.doc('body > div.content > div.nagetivebanner1 > div > span > a:nth-child(2)').html(),
        }
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  pyspider