scrapy框架下爬取51job网站信息,并存储到表格中
2018-03-02 17:53
330 查看
1. 通过命令创建项目
scrapy startproject JobSpider
2. 用pycharm打开项目
3. 通过命令创建爬虫
scrapy genspider job baidu.com
4. 配置 settings(实际的设置名如下):
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 0.5
COOKIES_ENABLED = False
5. 自定义 UserAgentMiddleware,可以直接粘贴现成的代码(自己添加的获取 user-agent 的类):
class JobUserAgentMiddleware(object):
    """Downloader middleware that puts a random User-Agent header on every
    outgoing request.

    NOTE(review): relies on ``UserAgent`` from the third-party
    ``fake_useragent`` package being imported in this module; ``.random``
    returns a different browser UA string on each access.
    """

    def __init__(self, user_agent='Scrapy', name=''):
        # The ``user_agent``/``name`` parameters are kept for signature
        # compatibility but are ignored: a fresh UserAgent() pool is
        # always used instead.
        self.user_agent = UserAgent()

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy entry point.  Crawler settings are not consulted; the
        # UA pool is self-contained.
        return cls()

    def spider_opened(self, spider):
        # Intentional no-op hook.  A per-spider override could be honoured
        # here via getattr(spider, 'user_agent', self.user_agent).
        pass

    def process_request(self, request, spider):
        # Header keys are bytes in Scrapy.  setdefault avoids clobbering a
        # User-Agent that was set explicitly on the request.
        if self.user_agent:
            request.headers.setdefault(b'User-Agent', self.user_agent.random)
6. 开始解析数据
在Terminal终端里面,创建文件 scrapy genspider job baidu.com
1) 先大致规划一下需要几个函数
2) 函数1跳转到函数2使用 yield scrapy.Request(url, callback, meta, dont_filter)。spider 文件如下:
# -*- coding: utf-8 -*-
import scrapy
from ..items import JobspiderItem
# --老师--51job数据获取方法一
class JobSpider(scrapy.Spider):
    """Crawl 51job keyword-search result pages (python/php/html) and yield
    one JobspiderItem per job row, following pagination until exhausted.
    """

    name = 'job'
    allowed_domains = ['51job.com']
    # One search-result start URL per keyword.  NOTE: the pasted original
    # had '&degreefrom' HTML-entity-mangled into '°reefrom'; restored here.
    start_urls = [
        'http://search.51job.com/list/010000%252C020000%252C030200%252C040000,000000,0000,00,9,99,python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        'http://search.51job.com/list/010000%252C020000%252C030200%252C040000,000000,0000,00,9,99,php,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        'http://search.51job.com/list/010000%252C020000%252C030200%252C040000,000000,0000,00,9,99,html,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
    ]

    def parse(self, response):
        """Default entry point: forward each start URL to parse_job_info.

        dont_filter=True is mandatory because this URL has already been
        seen by the dupe filter.
        """
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_job_info,
            meta={},
            dont_filter=True
        )

    def parse_next_page(self, response):
        """Follow the 'next page' link back into parse_job_info.

        parse_job_info -> parse_next_page -> parse_job_info is the mutual
        recursion that walks the whole result list.
        """
        # The second <li class="bk"> holds the "next page" anchor.
        next_page = response.xpath("//li[@class='bk'][2]/a/@href").extract_first('')
        if next_page:
            yield scrapy.Request(
                # FIX: urljoin resolves relative hrefs; the original passed
                # the raw href, which breaks when the site emits a relative
                # link.
                url=response.urljoin(next_page),
                callback=self.parse_job_info,
                meta={},
                dont_filter=True
            )

    def parse_job_info(self, response):
        """Extract every job row on the current result page.

        Yields one JobspiderItem per row, then hands the page to
        parse_next_page for pagination.
        """
        job_div_list = response.xpath("//div[@id='resultList']/div[@class='el']")
        for job_div in job_div_list:
            # Data cleaning: strip surrounding whitespace; commas inside the
            # job title become '/' so the comma-separated CSV stays aligned.
            job_name = job_div.xpath("p/span/a/@title").extract_first('无工作名称').strip().replace(",", "/")
            job_company_name = job_div.xpath("span[@class='t2']/a/@title").extract_first('无公司名称').strip()
            job_place = job_div.xpath("span[@class='t3']/text()").extract_first('无地点名称').strip()
            job_salary = job_div.xpath("span[@class='t4']/text()").extract_first('面议').strip()
            job_time = job_div.xpath("span[@class='t5']/text()").extract_first('无时间信息').strip()
            # Tag the source site; anything not 51job is bucketed as 'other'.
            job_type = '51job' if '51job.com' in response.url else '其它'
            print(job_type, job_name, job_company_name, job_place, job_salary, job_time)
            item = JobspiderItem()
            item['job_name'] = job_name
            item['job_company_name'] = job_company_name
            item['job_place'] = job_place
            item['job_salary'] = job_salary
            item['job_time'] = job_time
            item['job_type'] = job_type
            item['fan_kui_lv'] = "没有反馈率"
            yield item
        # One pagination request per page (after the row loop; the pasted
        # original lost its indentation, so placement is restored here).
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_next_page,
            dont_filter=True,
        )
7. 将数据封装到 items,记得 yield item。items.py 如下:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class JobspiderItem(scrapy.Item):
    """Container for one scraped 51job posting."""
    # define the fields for your item here like:
    job_name = scrapy.Field()          # job title (commas replaced with '/')
    job_company_name = scrapy.Field()  # hiring company name
    job_place = scrapy.Field()         # work location
    job_salary = scrapy.Field()        # salary text; '面议' when absent
    job_time = scrapy.Field()          # posting date text
    job_type = scrapy.Field()          # source tag: '51job' or '其它'
    fan_kui_lv = scrapy.Field()        # feedback rate (placeholder in this spider)
8. 自定义 pipelines,将数据存储到数据库/文件中。pipelines.py 如下:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html """
Pipeline:俗称管道,用于接收爬虫返回的item数据
"""
class JobspiderPipeline(object):
    """Default (generated) pipeline: forwards every item unchanged."""

    def process_item(self, item, spider):
        # Nothing to transform here — return the item so any later
        # pipeline in the chain still receives it.
        return item
"""
以下代码 在settings.py中调用
"""
class ToCsvPipeline(object):
    """Append each scraped item as one row of ``job.csv``.

    gb18030 encoding keeps Chinese text readable when the file is opened
    in Excel.  Enabled through ITEM_PIPELINES in settings.py.
    """

    # Column order written for every CSV row.
    FIELDS = ('job_name', 'job_company_name', 'job_place', 'job_salary',
              'job_time', 'job_type', 'fan_kui_lv')

    def process_item(self, item, spider):
        import csv  # local import keeps this snippet self-contained
        row = [item[field] for field in self.FIELDS]
        # newline='' lets the csv module manage line endings.  FIX: the
        # original hand-rolled ",".join([..., "\n"]) emitted a spurious
        # trailing comma on every row and corrupted rows whose fields
        # contained commas; csv.writer quotes such fields correctly.
        with open("job.csv", "a", encoding="gb18030", newline="") as f:
            csv.writer(f).writerow(row)
        # Return the item so the next pipeline in the chain receives it.
        return item
9.执行结果如下:
生成表格:
scrapy startproject JobSpider
2. 用pycharm打开项目
3. 通过命令创建爬虫
scrapy genspider job baidu.com
4. 配置settingsrobots_obey=False
Download_delay=0.5
Cookie_enable=False
DOWNLOADER_MIDDLEWARES = {
    'JobSpider.middlewares.JobUserAgentMiddleware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
"""调用Pipeline中自己写的类"""
ITEM_PIPELINES = {
    'JobSpider.pipelines.ToCsvPipeline': 300,
}
5. 自定义 UserAgentMiddleware
可以直接粘贴现成的代码(自己添加的获取 user-agent 的类):
class JobUserAgentMiddleware(object):
    """Downloader middleware that puts a random User-Agent header on every
    outgoing request.

    NOTE(review): relies on ``UserAgent`` from the third-party
    ``fake_useragent`` package being imported in this module; ``.random``
    returns a different browser UA string on each access.
    """

    def __init__(self, user_agent='Scrapy', name=''):
        # The ``user_agent``/``name`` parameters are kept for signature
        # compatibility but are ignored: a fresh UserAgent() pool is
        # always used instead.
        self.user_agent = UserAgent()

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy entry point.  Crawler settings are not consulted; the
        # UA pool is self-contained.
        return cls()

    def spider_opened(self, spider):
        # Intentional no-op hook.  A per-spider override could be honoured
        # here via getattr(spider, 'user_agent', self.user_agent).
        pass

    def process_request(self, request, spider):
        # Header keys are bytes in Scrapy.  setdefault avoids clobbering a
        # User-Agent that was set explicitly on the request.
        if self.user_agent:
            request.headers.setdefault(b'User-Agent', self.user_agent.random)
6. 开始解析数据
在Terminal终端里面,创建文件 scrapy genspider job baidu.com
1) 先大致规划一下需要几个函数
2) 函数1跳转到函数2使用 yield scrapy.Request(url,callback,meta,dont_filter)# -*- coding: utf-8 -*-
import scrapy
from ..items import JobspiderItem
# --老师--51job数据获取方法一
class JobSpider(scrapy.Spider):
    """Crawl 51job keyword-search result pages (python/php/html) and yield
    one JobspiderItem per job row, following pagination until exhausted.
    """

    name = 'job'
    allowed_domains = ['51job.com']
    # One search-result start URL per keyword.  NOTE: the pasted original
    # had '&degreefrom' HTML-entity-mangled into '°reefrom'; restored here.
    start_urls = [
        'http://search.51job.com/list/010000%252C020000%252C030200%252C040000,000000,0000,00,9,99,python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        'http://search.51job.com/list/010000%252C020000%252C030200%252C040000,000000,0000,00,9,99,php,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        'http://search.51job.com/list/010000%252C020000%252C030200%252C040000,000000,0000,00,9,99,html,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
    ]

    def parse(self, response):
        """Default entry point: forward each start URL to parse_job_info.

        dont_filter=True is mandatory because this URL has already been
        seen by the dupe filter.
        """
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_job_info,
            meta={},
            dont_filter=True
        )

    def parse_next_page(self, response):
        """Follow the 'next page' link back into parse_job_info.

        parse_job_info -> parse_next_page -> parse_job_info is the mutual
        recursion that walks the whole result list.
        """
        # The second <li class="bk"> holds the "next page" anchor.
        next_page = response.xpath("//li[@class='bk'][2]/a/@href").extract_first('')
        if next_page:
            yield scrapy.Request(
                # FIX: urljoin resolves relative hrefs; the original passed
                # the raw href, which breaks when the site emits a relative
                # link.
                url=response.urljoin(next_page),
                callback=self.parse_job_info,
                meta={},
                dont_filter=True
            )

    def parse_job_info(self, response):
        """Extract every job row on the current result page.

        Yields one JobspiderItem per row, then hands the page to
        parse_next_page for pagination.
        """
        job_div_list = response.xpath("//div[@id='resultList']/div[@class='el']")
        for job_div in job_div_list:
            # Data cleaning: strip surrounding whitespace; commas inside the
            # job title become '/' so the comma-separated CSV stays aligned.
            job_name = job_div.xpath("p/span/a/@title").extract_first('无工作名称').strip().replace(",", "/")
            job_company_name = job_div.xpath("span[@class='t2']/a/@title").extract_first('无公司名称').strip()
            job_place = job_div.xpath("span[@class='t3']/text()").extract_first('无地点名称').strip()
            job_salary = job_div.xpath("span[@class='t4']/text()").extract_first('面议').strip()
            job_time = job_div.xpath("span[@class='t5']/text()").extract_first('无时间信息').strip()
            # Tag the source site; anything not 51job is bucketed as 'other'.
            job_type = '51job' if '51job.com' in response.url else '其它'
            print(job_type, job_name, job_company_name, job_place, job_salary, job_time)
            item = JobspiderItem()
            item['job_name'] = job_name
            item['job_company_name'] = job_company_name
            item['job_place'] = job_place
            item['job_salary'] = job_salary
            item['job_time'] = job_time
            item['job_type'] = job_type
            item['fan_kui_lv'] = "没有反馈率"
            yield item
        # One pagination request per page (after the row loop; the pasted
        # original lost its indentation, so placement is restored here).
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_next_page,
            dont_filter=True,
        )
7. 将数据封装到items,记得yield item# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class JobspiderItem(scrapy.Item):
    """Container for one scraped 51job posting."""
    # define the fields for your item here like:
    job_name = scrapy.Field()          # job title (commas replaced with '/')
    job_company_name = scrapy.Field()  # hiring company name
    job_place = scrapy.Field()         # work location
    job_salary = scrapy.Field()        # salary text; '面议' when absent
    job_time = scrapy.Field()          # posting date text
    job_type = scrapy.Field()          # source tag: '51job' or '其它'
    fan_kui_lv = scrapy.Field()        # feedback rate (placeholder in this spider)
8. 自定义pipelines将数据存储到数据库/文件中# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html """
Pipeline:俗称管道,用于接收爬虫返回的item数据
"""
class JobspiderPipeline(object):
    """Default (generated) pipeline: forwards every item unchanged."""

    def process_item(self, item, spider):
        # Nothing to transform here — return the item so any later
        # pipeline in the chain still receives it.
        return item
"""
以下代码 在settings.py中调用
"""
class ToCsvPipeline(object):
    """Append each scraped item as one row of ``job.csv``.

    gb18030 encoding keeps Chinese text readable when the file is opened
    in Excel.  Enabled through ITEM_PIPELINES in settings.py.
    """

    # Column order written for every CSV row.
    FIELDS = ('job_name', 'job_company_name', 'job_place', 'job_salary',
              'job_time', 'job_type', 'fan_kui_lv')

    def process_item(self, item, spider):
        import csv  # local import keeps this snippet self-contained
        row = [item[field] for field in self.FIELDS]
        # newline='' lets the csv module manage line endings.  FIX: the
        # original hand-rolled ",".join([..., "\n"]) emitted a spurious
        # trailing comma on every row and corrupted rows whose fields
        # contained commas; csv.writer quotes such fields correctly.
        with open("job.csv", "a", encoding="gb18030", newline="") as f:
            csv.writer(f).writerow(row)
        # Return the item so the next pipeline in the chain receives it.
        return item
9.执行结果如下:
生成表格:
相关文章推荐
- 基于scrapy框架下爬取智联招聘--并把信息存储下来
- 关于scrapy爬取51job网以及智联招聘信息存储文件的设置
- 爬取腾讯招聘scrapy框架实现,并以(表格,json)形式存储到本地
- javaEE01-使用很HTML的排版标签编写“网站信息页面”,使用图片标签编写“图面显示页面”,使用列表标签编写“友情链接页面”,使用表格标签编写“首页”,框架标签表现“后台页面”
- scrapy框架爬取知乎110w用户信息,并存入mysql数据库和mongoDB数据库
- 爬虫 scrapy 框架学习 2. Scrapy框架业务逻辑的理解 + 爬虫案例 下载指定网站所有图片
- Winform开发框架中实现信息阅读状态的显示和存储
- Python爬虫框架Scrapy实战之定向批量获取职位招聘信息
- Scrapy框架结合Spynner采集需进行js,ajax动态加载的网页并提取网页信息(以采集微信公众号文章列表为例)
- ASP.NET MVC5 网站开发实践(一) - 框架(续) 模型、数据存储、业务逻辑
- Scrapy:Python3版本上安装数据挖掘必备的scrapy框架详细攻略(二最完整爬取网页内容信息攻略)——Jason niu
- 【实战\聚焦Python分布式爬虫必学框架Scrapy 打造搜索引擎项目笔记】第4章 scrapy爬取知名技术文章网站(1)
- Scrapy爬虫框架学习之自定义Pipelines将文件以Json格式存储
- 网络爬虫:使用Scrapy框架编写一个抓取书籍信息的爬虫服务
- Python爬虫框架Scrapy实战 - 抓取BOSS直聘招聘信息
- Scrapy框架结合Spynner采集需进行js,ajax动态加载的网页并提取网页信息(以采集微信公众号文章列表为例)
- Python爬虫框架Scrapy实战 - 抓取BOSS直聘招聘信息
- 配合scrapy,用请求方式抓取一些网站内容。例如抓取360手机应用APP信息。
- Scrapy框架爬取有验证码的登录网站
- Scrapy实战-爬取某博客聚合网站信息