您的位置：首页 > 编程语言 > Python开发

Python抓取知乎答案内容

2016-08-24 18:10 387 查看

import urllib2
import re
from bs4 import BeautifulSoup

class Spider(object):
    """Fetch a web page and print selected parts of its HTML.

    The instance stores the User-Agent string that is sent with every
    request made through getContentFromHost.

    Output uses single-argument ``print(...)`` calls, which behave the same
    whether ``print`` is the Python 2 statement or the Python 3 function.
    """

    def __init__(self, user_agent):
        """Store the User-Agent header value used for requests."""
        self.user_agent = user_agent

    def analyzeHtml(self, content):
        """Parse *content* as HTML and print author links and answer bodies.

        Prints, in order:
          * the raw content;
          * every ``<a class="author-link">`` tag, each followed by its
            text fragments;
          * for every ``<div class="zm-editable-content clearfix">``, its
            text fragments followed by the tag itself.

        If *content* is None, prints "Empty" and returns without parsing.
        """
        if content is None:
            print("Empty")
            # Nothing to parse; do not hand None to the HTML parser.
            return

        # Raw dump of the downloaded page, kept from the original script.
        # NOTE(review): original indentation was lost; this print is assumed
        # to run for non-empty content -- confirm.
        print(content)

        soup = BeautifulSoup(content, "html.parser")

        # find_all yields an empty result when nothing matches, so the loops
        # below simply do not run in that case.
        for author_link in soup.find_all("a", class_="author-link"):
            print(author_link)
            for author_text in author_link.strings:
                print(author_text)

        answers = soup.find_all("div", class_="zm-editable-content clearfix")
        for answer in answers:
            for answer_text in answer.strings:
                print(answer_text)
            # NOTE(review): original indentation was lost; printing the tag
            # once per answer (not once per text fragment) is assumed --
            # confirm.
            print(answer)

    def getContentFromHost(self, url):
        """Request *url* with the stored User-Agent and return the body.

        Returns whatever ``response.read()`` yields. Exceptions raised while
        opening or reading the URL propagate to the caller.
        """
        headers = {"User-Agent": self.user_agent}
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()

if __name__ == '__main__':
    # Script entry point: download one Zhihu question page and print its
    # author links and answer text.
    question_url = "https://www.zhihu.com/question/48554642"
    browser_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
    crawler = Spider(browser_agent)
    page = crawler.getContentFromHost(question_url)
    crawler.analyzeHtml(page)

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： python BeautifulSoup 爬虫

相关文章推荐

新的分享

章节导航