Scrapy——模拟登陆爬取github issues
2017-09-10 20:48
218 查看
# -*- encoding: utf-8 -*-
"""Scrapy spider that logs in to GitHub and scrapes issue pages.

Flow: fetch the login form -> extract the CSRF token -> POST credentials
to /session -> crawl /issues, following issue links via the CrawlSpider Rule.
A Scrapy cookiejar (meta['cookiejar']) carries the session across requests.
"""
import logging
import sys

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request, FormRequest, HtmlResponse


class GithubSpider(CrawlSpider):
    """Log in to GitHub, then follow issue links and log each issue's title."""

    name = 'github'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/issues']

    rules = [
        Rule(
            LinkExtractor(
                allow=(r'/issues/\d+',),
                restrict_css='ul li div div:nth-child(3) a:nth-child(2)',
            ),
            callback='parse_page',
        ),
    ]

    # Headers for the login POST.
    # FIX: the original dict used ' Referer' (leading space), producing a
    # malformed header key so the Referer was never sent as intended.
    posts_headers = {
        'Host': 'github.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://github.com/',
        'Connection': 'keep-alive',
    }

    def start_requests(self):
        """Fetch the login page first, attaching a cookiejar via request meta."""
        return [Request('https://github.com/login',
                        meta={'cookiejar': 1},
                        callback=self.parse_login)]

    def parse_login(self, response):
        """Extract the CSRF token from the login form and submit credentials.

        Returns a list with one FormRequest POSTing to /session; on success
        Scrapy calls :meth:`after_login` with the same cookiejar.
        """
        authenticity_token = response.css(
            'form input[name="authenticity_token"]::attr(value)').extract_first()
        # Lazy %-args: no TypeError if extract_first() returned None.
        logging.info('authenticity_token: %s', authenticity_token)
        return [FormRequest.from_response(
            response,
            url='https://github.com/session',
            headers=self.posts_headers,
            meta={'cookiejar': response.meta['cookiejar']},
            formdata={
                'commit': 'Sign+in',
                'utf8': '✓',
                'authenticity_token': authenticity_token,
                # SECURITY: credentials should come from settings or
                # environment variables, never be hard-coded in source.
                'login': '742790905@qq.com',
                'password': '*****',
            },
            callback=self.after_login,
            dont_filter=True,
        )]

    def after_login(self, response):
        """Start the real crawl once the session cookie is established."""
        for url in self.start_urls:
            # The Rule above drives link extraction, so a plain Request
            # (dispatched to CrawlSpider.parse) is all that is needed here.
            yield Request(url, meta={'cookiejar': response.meta['cookiejar']})

    def parse_page(self, response):
        """Log the URL and title of a single issue page."""
        logging.info(u'--------------消息分割线-----------------')
        logging.info(response.url)
        issue_title = response.xpath(
            '//span[@class="js-issue-title"]/text()').extract_first()
        # FIX: the original did u'issue_title:' + issue_title.encode('utf-8'),
        # mixing unicode and bytes -- UnicodeDecodeError for non-ASCII titles
        # on Python 2, b'...' noise on Python 3, and a crash when the title
        # is None. Lazy %-formatting handles all of those.
        logging.info('issue_title: %s', issue_title)

    # NOTE(review): requests generated by CrawlSpider's own rule-following
    # machinery do NOT inherit meta['cookiejar'], so issue pages may be
    # fetched without the logged-in session. The author's override below
    # (kept from the original, commented out) propagates the cookiejar;
    # verify against the installed Scrapy version before enabling.
    # def _requests_to_follow(self, response):
    #     """Override that propagates the cookiejar into followed requests."""
    #     if not isinstance(response, HtmlResponse):
    #         return
    #     seen = set()
    #     for n, rule in enumerate(self._rules):
    #         links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
    #         if links and rule.process_links:
    #             links = rule.process_links(links)
    #         for link in links:
    #             seen.add(link)
    #             r = Request(url=link.url, callback=self._response_downloaded)
    #             # The cookiejar entry below is the added line:
    #             r.meta.update(rule=n, link_text=link.text, cookiejar=response.meta['cookiejar'])
    #             yield rule.process_request(r)
相关文章推荐
- 运维学python之爬虫高级篇(六)scrapy模拟登陆GitHub和51cto
- (案例六)三种scrapy模拟登陆策略
- 使用scrapy 模拟登陆网站后 抓取会员中心相关信息
- [Scrapy]模拟登陆并获取Cookie值
- Scrapy中用cookie模拟登陆新浪微博
- [Scrapy]模拟登陆并获取Cookie值
- 使用scrapy模拟登陆一般登陆不了的网站[已解决]
- scrapy框架半自动处理验证码豆瓣网模拟登陆
- Scrapy三种模拟登陆策略
- 第三百四十三节,Python分布式爬虫打造搜索引擎Scrapy精讲—scrapy模拟登陆和知乎倒立文字验证码识别
- 【scrapy】模拟登陆知乎
- Scrapy模拟登录GitHub
- scrapy 模拟登陆
- python3 scrapy模拟登陆豆瓣
- Scrapy框架学习(五)----Request、Response介绍及模拟GitHub登录
- scrapy 的crawl模板模拟登陆
- python scrapy爬取生物谷之模拟登陆(使用FormRequest)
- php curl 模拟登陆https
- php中通过curl模拟登陆discuz论坛的实现代码