您的位置:首页 > 理论基础 > 计算机网络

十.scrapy项目 爬取主页http://cuiqingcai.com/获取所有url与title

2017-07-10 10:50 399 查看
一.分析采用crawlspider,利用rule规则提取url,并且follow=True追踪下去

rules = (
Rule(LinkExtractor(allow=('\d+\.html$',)), callback='parse_all', follow=True),
# Rule(LinkExtractor(allow=('\d+\.html$',)), callback='parse_pachong', follow=True),
)


二.spider为

#coding:utf-8
from scrapy.spiders import CrawlSpider, Rule, Request
from scrapy.linkextractors import LinkExtractor
from ..items import CuiqingcaiItem

class myspider(CrawlSpider):
    """Crawl cuiqingcai.com and yield one (url, title) item per article page.

    CrawlSpider follows every link whose URL matches ``\\d+\\.html$`` (the
    site's article pages) and, with ``follow=True``, keeps extracting links
    from each matched page so the whole site is traversed.
    """

    name = 'cqc'
    allowed_domains = ['cuiqingcai.com']
    count_all = 0   # running count of matched pages (kept for compatibility)
    url_all = []    # URLs of matched pages (kept for compatibility)
    start_urls = ['http://cuiqingcai.com']
    # Tags of interest for the (removed) tag-filtering callback; kept so
    # external code referencing the attribute keeps working.
    label_tags = [u'爬虫', 'scrapy', 'selenium']

    rules = (
        # Raw string for the regex: '\d' in a plain string is an invalid
        # escape on modern Python.
        Rule(LinkExtractor(allow=(r'\d+\.html$',)),
             callback='parse_all', follow=True),
    )

    # Store the whole-site data (exported e.g. to a JSON file / pipeline).
    def parse_all(self, response):
        """Extract the article title and URL from a matched page.

        Returns a CuiqingcaiItem. ``title`` is None when the page has no
        ``<header><h1 class="article-title"><a>`` element — the original
        ``extract()[0]`` raised IndexError on such pages and queried the
        same XPath twice.
        """
        title_name = response.xpath(
            '//header/h1[1][@class="article-title"]/a/text()'
        ).extract_first()
        item = CuiqingcaiItem()
        item['url'] = response.url
        item['title'] = title_name
        return item


三.pipelines为

import json
from pymongo import MongoClient
import settings
from items import CuiqingcaiItem

class CuiqingcaiPipeline(object):
    """Persist crawled ``CuiqingcaiItem`` records into MongoDB."""

    def __init__(self):
        # Connect once at pipeline start-up; the collection handle is
        # reused for every processed item.
        client = MongoClient('127.0.0.1', 27017)
        db = client[settings.Mongodb_DBNAME]
        self.table = db[settings.Mongodb_DBTable]

    def process_item(self, item, spider):
        """Insert matching items into MongoDB, best-effort.

        Fixes two defects of the original:
        * ``except Exception, e`` is Python-2-only syntax (SyntaxError on
          Python 3) — replaced with ``except Exception``.
        * ``return item`` was inside the ``isinstance`` branch, so
          non-matching items were dropped (``None`` returned), breaking any
          downstream pipeline. Scrapy's contract is to always return the
          item.
        """
        if isinstance(item, CuiqingcaiItem):
            try:
                self.table.insert(dict(item))
            except Exception:
                # Deliberate best-effort: skip duplicates / transient DB
                # errors rather than aborting the whole crawl.
                pass
        return item


四.item为

import scrapy

class CuiqingcaiItem(scrapy.Item):
    """Item carrying one crawled page: its URL and its article title."""

    # Absolute URL of the crawled page.
    url = scrapy.Field()
    # Text of the page's article title.
    title = scrapy.Field()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: