
Scrapy Zhihu keyword crawler: spider code

2017-07-06
Below is the spider code. Crawling Zhihu requires login; a cookie is enough. If you don't expect to crawl many pages, do not use a large number of threads, or the account gets banned quickly and cannot be used again for a dozen-plus hours. Weighed against those lost hours, even a single thread can fetch plenty of pages, so aggressive concurrency is not worth it.
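
For a single account, the quickest approach is to copy the cookies from a logged-in browser session and attach them to the initial requests. A minimal sketch of a start_requests that does this; the cookie name z_c0 and its value are placeholders you must replace with what your own browser shows after login:

    # attach a logged-in cookie to every initial request
    # (placeholder values -- copy the real ones from your browser's developer
    #  tools after logging in; z_c0 is assumed to be Zhihu's login token)
    def start_requests(self):
        cookies = {
            'z_c0': 'REPLACE_WITH_YOUR_TOKEN',
        }
        for url in self.start_urls:
            yield Request(url=url, cookies=cookies, callback=self.parse, dont_filter=True)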

Zhihu's anti-crawling is keyed to the account: switching the User-Agent or IP does not help. If you need high concurrency, you have to spread the load across dozens of accounts.
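
As a rough sketch of that multi-account approach (the middleware name and the pool contents are my own illustration, not part of the original post), a downloader middleware could rotate a pool of account cookies across outgoing requests:

    # middlewares.py: hypothetical sketch of rotating a pool of account cookies
    import random

    class AccountCookieMiddleware(object):
        # fill the pool with real cookies captured from logged-in sessions
        COOKIE_POOL = [
            {'z_c0': 'ACCOUNT_1_TOKEN'},
            {'z_c0': 'ACCOUNT_2_TOKEN'},
            # ...dozens of accounts for high concurrency
        ]

        def process_request(self, request, spider):
            # pick an account at random for each outgoing request
            request.cookies = random.choice(self.COOKIE_POOL)

    # settings.py: register it before the built-in CookiesMiddleware (priority 700)
    # DOWNLOADER_MIDDLEWARES = {'zhihu.middlewares.AccountCookieMiddleware': 543}

If answers must stay within one session per account, Scrapy's request.meta['cookiejar'] key can keep a separate cookie jar per account instead of setting request.cookies directly.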

# -*- coding: utf-8 -*-
from scrapy import Request
from scrapy_redis.spiders import RedisSpider
from zhihu.items import ZhihuItem
from urllib import quote
import re
import json
import time


class BaoxianSpider(RedisSpider):                  # distributed crawling via scrapy-redis

    name = "baoxian"
    allowed_domains = ["zhihu.com"]
    #redis_key = 'baoxian:start_urls'
    keywords = '软件测试'                           # the keyword to crawl
    urlencode_keywords = quote(keywords)

    # e.g. https://www.zhihu.com/r/search?q=%E4%BF%9D%E9%99%A9&type=content&offset=0
    start_urls = ['https://www.zhihu.com/r/search?q=' + urlencode_keywords + '&type=content&offset=0']

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        # the search endpoint returns JSON like:
        # {"paging":{"next":"\/r\/search?q=...&type=content&offset=50"},"htmls":[...]}
        body = response.body

        # extract question links from the escaped HTML fragments
        question_href_reg = r'<div class=\\"title\\"><a target=\\"_blank\\" href=\\"\\/question\\/(.*?)\\"'
        all_question_href = re.findall(question_href_reg, body)
        print 'all_question_href:', all_question_href
        for aqh in all_question_href:
            question_href = 'https://www.zhihu.com/question/' + str(aqh)
            yield Request(url=question_href, callback=self.parse_question, dont_filter=True)
            print question_href
            self.logger.info("question_href:%s \n list_question_page:%s", question_href, response.url)

        # extract the next page of search results
        reg = r'{"paging":{"next":"(\\/r\\/search\?q=.*?&type=content&offset=.*?)"},"htmls"'
        next_page = re.findall(reg, body)
        print 'next page:', next_page
        if next_page:
            next_page_url = 'https://www.zhihu.com' + next_page[0].replace('\\', '')
            print 'next_page_url:', next_page_url
            yield Request(url=next_page_url, callback=self.parse, dont_filter=True)
            self.logger.info("next_page_url:%s", next_page_url)

    def parse_question(self, response):            # question detail page
        print 'response.url:', response.url
        title = response.xpath('//h1[@class="QuestionHeader-title"]/text()').extract_first()
        print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        print 'title:', title
        # the question description is embedded in the page JSON:
        # editableDetail":"...","visitCount"
        reg = r'editableDetail":"([\s\S]*?)","visitCount"'
        content_match = re.findall(reg, response.body)
        if content_match:
            content = content_match[0]
        else:
            content = ''                           # the question may have no description
        print 'content:', content
        question = {}
        question['url'] = response.url
        question['title'] = title
        question['content'] = content
        question['comment'] = []
        # fetch the answers through the v4 JSON API, 20 per page
        answer_json = 'https://www.zhihu.com/api/v4/questions/' + re.findall(r'(\d+)', response.url)[0] + '/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=20&offset=0'
        print 'answer_json:', answer_json
        yield Request(url=answer_json, callback=self.parse_json, meta=question, dont_filter=False)

    def parse_json(self, response):                # answer list pages from the JSON API
        meta = response.meta
        result = json.loads(response.body)
        print 'dict to json:', json.dumps(result, ensure_ascii=False)
        comment_list = meta['comment']
        for data in result['data']:                # result['data'] is a list of dicts, one per answer
            try:
                comment_dict = {}
                comment_dict['comment_content'] = data['content']
                if data['author']['name']:
                    comment_dict['author'] = data['author']['name']
                else:
                    comment_dict['author'] = ''
                comment_dict['voteup_count'] = data['voteup_count']
                comment_dict['comment_count'] = data['comment_count']
                comment_dict['comment_time'] = time.strftime('%Y-%m-%d', time.localtime(data['created_time']))
                comment_list.append(comment_dict)
            except Exception as e:
                print e
        meta['comment'] = comment_list
        meta['answer_num'] = result['paging']['totals']

        if not result['paging']['is_end']:         # follow pagination until the last answer page
            yield Request(url=result['paging']['next'], callback=self.parse_json, meta=meta, dont_filter=False)
        else:
            print 'last:', meta['title'], meta['url'], meta['content'], meta['answer_num'], len(meta['comment'])
            item = ZhihuItem()
            item['title'] = meta['title']
            item['url'] = meta['url']
            item['content'] = meta['content']
            item['answer_num'] = meta['answer_num']
            item['comment'] = meta['comment']
            yield item
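
The spider imports ZhihuItem and runs on scrapy-redis, neither of which is shown in the post. A minimal guess at items.py based on the five fields the spider fills in, plus the usual scrapy-redis plumbing in settings.py (the Redis address is an assumption):

    # items.py: the five fields the spider populates
    import scrapy

    class ZhihuItem(scrapy.Item):
        title = scrapy.Field()
        url = scrapy.Field()
        content = scrapy.Field()
        answer_num = scrapy.Field()
        comment = scrapy.Field()      # list of per-answer dicts

    # settings.py: scrapy-redis scheduler and dupefilter
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    SCHEDULER_PERSIST = True                  # keep the queue across runs
    REDIS_URL = 'redis://localhost:6379'      # assumed local Redis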
Here are the run results; storage uses MongoDB. (Screenshots of the stored documents are omitted.)
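
The storage pipeline itself isn't shown in the post; a minimal pymongo sketch (connection details, database, and collection names are assumptions) would be:

    # pipelines.py: minimal MongoDB storage sketch
    import pymongo

    class MongoPipeline(object):
        def __init__(self):
            client = pymongo.MongoClient('localhost', 27017)   # assumed local mongod
            self.collection = client['zhihu']['baoxian']       # assumed db/collection names

        def process_item(self, item, spider):
            self.collection.insert_one(dict(item))             # one document per question
            return item

    # settings.py
    # ITEM_PIPELINES = {'zhihu.pipelines.MongoPipeline': 300}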
The contents of the comment field:
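
Each element of the comment list follows the structure built in parse_json; an illustrative entry (all values made up) looks like:

    {
        "comment_content": "<p>answer body HTML...</p>",
        "author": "张三",
        "voteup_count": 12,
        "comment_count": 3,
        "comment_time": "2017-05-20"
    }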