Scraping National Natural Science Foundation data from ScienceNet (fund.sciencenet.cn) with the Python Scrapy crawler framework
The project consists of four files, listed here and shown in full below:
- fundspider.py
- items.py
- pipelines.py
- settings.py
A screenshot of the crawl results follows at the end.
fundspider.py
# -*- coding: utf-8 -*-
import re

from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request

from fundsort.items import FundItem

# Listing URL for one university (here 北京外国语大学, URL-encoded in company=).
# Swap this once per university; it must end with "page=" so a page number can
# be appended.
BASE_URL = ("http://fund.sciencenet.cn/index.php/search/project?name=&person=&no="
            "&company=%E5%8C%97%E4%BA%AC%E5%A4%96%E5%9B%BD%E8%AF%AD%E5%A4%A7%E5%AD%A6"
            "&subject=&money1=&money2=&startTime=2005&endTime=2015&subcategory="
            "&redract_url=&submit.x=0&submit.y=0&page=")


class FundSpider(CrawlSpider):  # no Rules are defined; parse() is overridden directly
    name = "fund"
    id = 0
    allowed_domains = ["fund.sciencenet.cn"]
    start_urls = [BASE_URL + "1"]

    def parse(self, response):
        # Queue every listing page. range(1, 17) yields pages 1..16; page 17
        # onward is not crawled.
        for i in range(1, 17):
            item = FundItem()
            item['page'] = str(i)  # remember which listing page this came from
            yield Request(BASE_URL + str(i), meta={'item': item},
                          callback=self.parse_link)

    def parse_link(self, response):
        # Listing page: follow the detail link of every project on the page.
        sel = Selector(response)
        item = response.meta['item']
        items_url = sel.xpath('//div[@id="resultLst"]//div[position()>0]//p//a//@href').extract()
        for item_url in items_url:
            yield Request(url=str(item_url), meta={'item': item},
                          callback=self.parse_item)

    def parse_item(self, response):
        # Detail page: copy the shared meta item first, so concurrent detail
        # requests from the same listing page do not overwrite each other.
        item = FundItem(response.meta['item'])
        sel = Selector(response)
        item['id'] = str(self.getid())
        item['school'] = sel.xpath('//tbody/tr[2]/td[@colspan="2"]/text()').extract()
        item['subcategory'] = sel.xpath('//table[@class="tb no_print"]//tbody//tr[1]/td[@colspan="4"]/text()').extract()
        subcode = sel.xpath('//table[@class="tb no_print"]//tbody//tr[1]/td[@colspan="4"]/text()').extract()[0]
        item['subcode'] = re.findall(r'([A-M]\d\d)', subcode)  # e.g. "H31"
        item['itemname'] = sel.xpath('//div[@class="v_con"]//h1/text()').extract()
        item['fundmoney'] = sel.xpath('//table[@class="tb no_print"]//tbody//tr[3]/td[1]/text()').extract()
        item['time'] = sel.xpath('//table[@class="tb no_print"]//tbody/tr[3]/td[3]/text()').extract()
        item['principal'] = sel.xpath('//table[@class="tb no_print"]//tbody//tr[2]/td[1]/text()').extract()
        item['url'] = response.url
        return item

    def getid(self):
        # Sequential row counter; Scrapy runs callbacks in a single thread,
        # so a plain class attribute is safe.
        self.id += 1
        return self.id
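With the spider in place, the crawl is started from the project root in the usual Scrapy way; the spider's name attribute is fund, so:

scrapy crawl fund

parse() queues the 16 listing pages, parse_link() follows each project's detail link, and parse_item() returns one populated FundItem per project, which the pipeline below writes out as a CSV row.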
items.py
from scrapy.item import Item, Field


class FundsortItem(Item):
    # define the fields for your item here like:
    # name = Field()
    pass


class FundItem(Item):
    id = Field()           # sequential crawl number
    itemname = Field()     # project title
    school = Field()       # university name
    subcode = Field()      # sub-discipline code, e.g. H31
    fundmoney = Field()    # grant amount
    subcategory = Field()  # subject line from the detail page
    time = Field()         # start/end dates
    principal = Field()    # principal investigator
    url = Field()          # detail-page URL
    page = Field()         # listing page number
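A FundItem behaves like a dict whose keys are restricted to the declared fields, which is what lets the spider and pipeline pass it around and index it by field name. A quick interactive sketch of that behavior (illustrative only):

>>> from fundsort.items import FundItem
>>> item = FundItem()
>>> item['page'] = '3'
>>> item['page']
'3'
>>> item['typo'] = 'x'    # fields not declared on FundItem are rejected
KeyError: 'FundItem does not support field: typo'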
pipelines.py

# -*- coding: utf-8 -*-
import csv


class JsonWithEncodingFundPipeline(object):
    def __init__(self):
        # Keep a handle on the underlying file so close_spider() can close it.
        # (This open/encoding step is the easiest place to make a mistake.)
        self.file = open('94.csv', 'wb')
        self.out = csv.writer(self.file)
        # Header row: crawl no., university, sub-discipline code, page,
        # subject, project title, amount, duration, PI, detail URL.
        entries = ['爬取序号', '大学', '二级学科代码', '页数', '学科代码',
                   '项目名称', '资助金额', '起止时间', '负责人', '详细链接']
        self.out.writerow(entries)

    def process_item(self, item, spider):
        line = []
        keys = ['id', 'school', 'subcode', 'page', 'subcategory',
                'itemname', 'fundmoney', 'time', 'principal', 'url']
        for key in keys:
            value = item.get(key, '')            # a string or a list of strings
            value = ''.join(value)               # flatten lists into one string
            line.append(value.encode('utf-8'))   # csv on Python 2 wants bytes
        self.out.writerow(line)
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider() on pipelines when the spider finishes;
        # a method named spider_closed is never called automatically.
        self.file.close()
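For reference, Scrapy's built-in feed exports can also write items to CSV without any custom pipeline:

scrapy crawl fund -o funds.csv -t csv

The hand-written pipeline is kept here because it controls the column order and writes the Chinese header row, which the built-in exporter does not do.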
settings.py
import random

BOT_NAME = 'fundsort'

SPIDER_MODULES = ['fundsort.spiders']
NEWSPIDER_MODULE = 'fundsort.spiders'

# Newer Scrapy versions expect a dict of {pipeline path: order} here rather
# than a bare list.
ITEM_PIPELINES = {
    'fundsort.pipelines.JsonWithEncodingFundPipeline': 300,
}

user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# Pick one agent at random for this run. Indexing with
# random.choice((0, 1, ..., 20)), as the original did, can exceed the
# 18-entry list and raise IndexError.
USER_AGENT = random.choice(user_agent_list)

COOKIES_ENABLED = False
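One caveat: settings are evaluated once at startup, so USER_AGENT above fixes a single randomly chosen agent for the entire run. Rotating the agent per request takes a small downloader middleware; a minimal sketch, assuming a new fundsort/middlewares.py module that is not part of the original post:

# fundsort/middlewares.py -- hypothetical addition, not in the original project
import random

from fundsort.settings import user_agent_list


class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # Choose a fresh User-Agent for every outgoing request.
        request.headers.setdefault('User-Agent', random.choice(user_agent_list))

It would then be enabled in settings.py with:

DOWNLOADER_MIDDLEWARES = {
    'fundsort.middlewares.RandomUserAgentMiddleware': 400,
}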
Crawl result screenshot: (image not included here)