
Crawling National Natural Science Foundation data from ScienceNet (fund.sciencenet.cn) with the Python Scrapy framework


fundspider.py

# -*- coding: utf-8 -*-

from scrapy.selector import Selector
from fundsort.items import FundItem
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request
import re


class FundSpider(CrawlSpider):
    name = "fund"
    id = 0
    allowed_domains = ["fund.sciencenet.cn"]
    # Change start_urls once for each university; the URL must end with "page=".
    start_urls = [
        "http://fund.sciencenet.cn/index.php/search/project?name=&person=&no=&company=%E5%8C%97%E4%BA%AC%E5%A4%96%E5%9B%BD%E8%AF%AD%E5%A4%A7%E5%AD%A6&subject=&money1=&money2=&startTime=2005&endTime=2015&subcategory=&redract_url=&submit.x=0&submit.y=0&page=1"
    ]

    def parse_item(self, response):
        item = response.meta['item']
        sel = Selector(response)
        num = str(self.getid())
        item['id'] = num
        item['school'] = sel.xpath('//tbody/tr[2]/td[@colspan="2"]/text()').extract()
        item['subcategory'] = sel.xpath('//table[@class="tb no_print"]//tbody//tr[1]/td[@colspan="4"]/text()').extract()
        subcode = sel.xpath('//table[@class="tb no_print"]//tbody//tr[1]/td[@colspan="4"]/text()').extract()[0]
        # Discipline codes are a letter A-M followed by two digits.
        item['subcode'] = re.findall(r'([A-M]\d\d)', subcode)
        item['itemname'] = sel.xpath('//div[@class="v_con"]//h1/text()').extract()
        item['fundmoney'] = sel.xpath('//table[@class="tb no_print"]//tbody//tr[3]/td[1]/text()').extract()
        item['time'] = sel.xpath('//table[@class="tb no_print"]//tbody/tr[3]/td[3]/text()').extract()
        item['principal'] = sel.xpath('//table[@class="tb no_print"]//tbody//tr[2]/td[1]/text()').extract()
        item['url'] = response.url
        return item

    def getid(self):
        self.id += 1
        return self.id

    def parse_link(self, response):
        sel = Selector(response)
        item = response.meta['item']
        items_url = sel.xpath('//div[@id="resultLst"]//div[position()>0]//p//a//@href').extract()
        for item_url in items_url:
            item_url = str(item_url)
            yield Request(url=item_url, meta={'item': item}, callback=self.parse_item)

    def parse(self, response):
        urlline = []
        # Adjust the page range here: range(1, 17) yields pages 1 to 16; page 17 is not crawled.
        for i in range(1, 17):
            i = str(i)
            # Same query string as start_urls, with the page number appended below.
            a = 'http://fund.sciencenet.cn/index.php/search/project?name=&person=&no=&company=%E5%8C%97%E4%BA%AC%E5%A4%96%E5%9B%BD%E8%AF%AD%E5%A4%A7%E5%AD%A6&subject=&money1=&money2=&startTime=2005&endTime=2015&subcategory=&redract_url=&submit.x=0&submit.y=0&page='
            a = a + i
            urlline.append(a)

        for middle_url in urlline:
            item = FundItem()
            middle_url = str(middle_url)
            s = str(re.findall(r'page=.{1,3}', middle_url))
            item['page'] = re.findall(r'\d{1,3}', s)
            yield Request(middle_url, meta={'item': item}, callback=self.parse_link)
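For reference, here is a minimal standalone illustration of the two-step regex that parse() uses to recover the page number from a listing URL. The sample URL below is a shortened stand-in for the real search URL, used only for this sketch:

# Standalone illustration of the page-number extraction in parse().
# The URL is a shortened stand-in for the real search URL.
import re

middle_url = 'http://fund.sciencenet.cn/index.php/search/project?startTime=2005&endTime=2015&page=12'
s = str(re.findall(r'page=.{1,3}', middle_url))   # "['page=12']"
page = re.findall(r'\d{1,3}', s)                  # ['12']
print(page)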


items.py

from scrapy.item import Item, Field


class FundsortItem(Item):
    # define the fields for your item here like:
    # name = Field()
    pass


class FundItem(Item):
    id = Field()
    itemname = Field()
    school = Field()
    subcode = Field()
    fundmoney = Field()
    subcategory = Field()
    time = Field()
    principal = Field()
    url = Field()
    page = Field()
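Scrapy Item objects behave like dictionaries, which is why the spider and pipeline read and write fields with item['key']. A minimal sketch (the sample values are made up purely for illustration):

# FundItem fields are accessed like dict keys; undeclared keys raise KeyError.
from fundsort.items import FundItem

item = FundItem()
item['id'] = '1'                            # hypothetical sample value
item['itemname'] = [u'example project title']
print(dict(item))                           # Items convert cleanly to plain dicts
# item['unknown'] = 'x'                     # would raise KeyError: no such field
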
pipelines.py

# -*- coding: utf-8 -*-

import csv


class JsonWithEncodingFundPipeline(object):
    """Despite its name, this pipeline writes the scraped items to a CSV file."""

    def __init__(self):
        # Easiest place to slip up: keep a reference to the file object so it
        # can be closed later (csv.writer itself has no close() method).
        self.file = open('94.csv', 'wb')
        self.out = csv.writer(self.file)
        # CSV header row (column names in Chinese: crawl id, university,
        # sub-discipline code, page, discipline code, project name, amount,
        # duration, principal investigator, detail URL).
        entries = ['爬取序号', '大学', '二级学科代码', '页数', '学科代码', '项目名称',
                   '资助金额', '起止时间', '负责人', '详细链接']
        self.out.writerow(entries)

    def process_item(self, item, spider):
        line = []
        keys = ['id', 'school', 'subcode', 'page', 'subcategory', 'itemname',
                'fundmoney', 'time', 'principal', 'url']
        for key in keys:
            value = item.get(key)          # xpath().extract() returns a list
            value = ''.join(value)         # flatten it to a single string
            value = value.encode('utf-8')  # encode before writing to the CSV
            line.append(value)
        self.out.writerow(line)
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider() on item pipelines when the spider finishes.
        self.file.close()
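After a crawl, the output file can be sanity-checked with a few lines of plain Python. This is just a quick read-back sketch, assuming the crawl has produced 94.csv in the working directory:

# Read back the header and the first two data rows of the pipeline's CSV.
import csv

with open('94.csv', 'rb') as f:
    for i, row in enumerate(csv.reader(f)):
        print([cell.decode('utf-8') for cell in row])
        if i >= 2:
            break
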


settings.py

import random

BOT_NAME = 'fundsort'

SPIDER_MODULES = ['fundsort.spiders']
NEWSPIDER_MODULE = 'fundsort.spiders'

# Enable the CSV pipeline (dict form with an order value, as used by Scrapy 0.24+).
ITEM_PIPELINES = {
    'fundsort.pipelines.JsonWithEncodingFundPipeline': 300,
}

user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# Pick one user agent at random when the settings module is loaded
# (one choice per run, not per request).
USER_AGENT = random.choice(user_agent_list)
COOKIES_ENABLED = False
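The USER_AGENT above is fixed for the whole run, because settings.py is only evaluated once. If per-request rotation is wanted, a small downloader middleware can draw from the same list on every request. The sketch below is one way to do it, assuming it lives in a hypothetical fundsort/middlewares.py and is enabled via DOWNLOADER_MIDDLEWARES:

# fundsort/middlewares.py (hypothetical): rotate the User-Agent per request.
import random

from fundsort.settings import user_agent_list


class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # Overwrite the User-Agent header before the request is sent.
        request.headers['User-Agent'] = random.choice(user_agent_list)

# To enable it, add to settings.py:
# DOWNLOADER_MIDDLEWARES = {
#     'fundsort.middlewares.RandomUserAgentMiddleware': 400,
# }

With everything in place, the crawl is started from the project root with scrapy crawl fund, and the pipeline writes the results to 94.csv.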


Screenshot of the crawled results:
